use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use cyanea_core::{CyaneaError, Result};
const FLAG_UNMAPPED: u16 = 0x4;
const FLAG_PAIRED: u16 = 0x1;
const FLAG_PROPER_PAIR: u16 = 0x2;
const FLAG_MATE_UNMAPPED: u16 = 0x8;
const FLAG_REVERSE: u16 = 0x10;
const FLAG_MATE_REVERSE: u16 = 0x20;
const FLAG_FIRST_IN_PAIR: u16 = 0x40;
const FLAG_SECOND_IN_PAIR: u16 = 0x80;
const FLAG_SECONDARY: u16 = 0x100;
const FLAG_SUPPLEMENTARY: u16 = 0x800;
#[derive(Debug, Clone)]
pub struct SamRecord {
pub qname: String,
pub flag: u16,
pub rname: String,
pub pos: u64,
pub mapq: u8,
pub cigar: String,
pub rnext: String,
pub pnext: u64,
pub tlen: i64,
pub sequence: String,
pub quality: String,
}
impl SamRecord {
pub fn is_mapped(&self) -> bool {
self.flag & FLAG_UNMAPPED == 0
}
pub fn is_unmapped(&self) -> bool {
self.flag & FLAG_UNMAPPED != 0
}
pub fn is_paired(&self) -> bool {
self.flag & FLAG_PAIRED != 0
}
pub fn is_proper_pair(&self) -> bool {
self.flag & FLAG_PROPER_PAIR != 0
}
pub fn is_mate_unmapped(&self) -> bool {
self.flag & FLAG_MATE_UNMAPPED != 0
}
pub fn is_reverse(&self) -> bool {
self.flag & FLAG_REVERSE != 0
}
pub fn is_mate_reverse(&self) -> bool {
self.flag & FLAG_MATE_REVERSE != 0
}
pub fn is_first_in_pair(&self) -> bool {
self.flag & FLAG_FIRST_IN_PAIR != 0
}
pub fn is_second_in_pair(&self) -> bool {
self.flag & FLAG_SECOND_IN_PAIR != 0
}
pub fn is_secondary(&self) -> bool {
self.flag & FLAG_SECONDARY != 0
}
pub fn is_supplementary(&self) -> bool {
self.flag & FLAG_SUPPLEMENTARY != 0
}
pub fn seq_len(&self) -> usize {
if self.sequence == "*" {
0
} else {
self.sequence.len()
}
}
}
#[derive(Debug, Clone)]
pub struct SamStats {
pub total_reads: usize,
pub mapped: usize,
pub unmapped: usize,
pub avg_mapq: f64,
pub avg_length: f64,
pub mapq_distribution: Vec<(u8, usize)>,
}
pub fn parse_sam_str(text: &str) -> Result<Vec<SamRecord>> {
let cursor = std::io::Cursor::new(text.as_bytes());
let reader = BufReader::new(cursor);
parse_sam_reader(reader, Path::new("<string>"))
}
pub fn parse_sam(path: impl AsRef<Path>) -> Result<Vec<SamRecord>> {
let path = path.as_ref();
let file = File::open(path).map_err(|e| {
CyaneaError::Io(std::io::Error::new(
e.kind(),
format!("{}: {}", path.display(), e),
))
})?;
let reader = BufReader::new(file);
parse_sam_reader(reader, path)
}
fn parse_sam_reader(reader: impl BufRead, path: &Path) -> Result<Vec<SamRecord>> {
let mut data_lines: Vec<(usize, String)> = Vec::new();
for (line_num, line_result) in reader.lines().enumerate() {
let line = line_result.map_err(|e| {
CyaneaError::Io(std::io::Error::new(
e.kind(),
format!("{}: line {}: {}", path.display(), line_num + 1, e),
))
})?;
let trimmed = line.trim().to_string();
if trimmed.is_empty() || trimmed.starts_with('@') {
continue;
}
data_lines.push((line_num + 1, trimmed));
}
#[cfg(feature = "parallel")]
{
use rayon::prelude::*;
data_lines
.par_iter()
.map(|(line_num, line)| parse_sam_record(line, *line_num, path))
.collect()
}
#[cfg(not(feature = "parallel"))]
data_lines
.iter()
.map(|(line_num, line)| parse_sam_record(line, *line_num, path))
.collect()
}
fn parse_sam_record(line: &str, line_num: usize, path: &Path) -> Result<SamRecord> {
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() < 11 {
return Err(CyaneaError::Parse(format!(
"{}: line {}: expected at least 11 tab-separated columns, found {}",
path.display(),
line_num,
fields.len()
)));
}
let qname = fields[0].to_string();
let flag: u16 = fields[1].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid FLAG '{}'",
path.display(),
line_num,
fields[1]
))
})?;
let rname = fields[2].to_string();
let pos: u64 = fields[3].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid POS '{}'",
path.display(),
line_num,
fields[3]
))
})?;
let mapq: u8 = fields[4].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid MAPQ '{}'",
path.display(),
line_num,
fields[4]
))
})?;
let cigar = fields[5].to_string();
let rnext = fields[6].to_string();
let pnext: u64 = fields[7].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid PNEXT '{}'",
path.display(),
line_num,
fields[7]
))
})?;
let tlen: i64 = fields[8].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid TLEN '{}'",
path.display(),
line_num,
fields[8]
))
})?;
let sequence = fields[9].to_string();
let quality = fields[10].to_string();
Ok(SamRecord {
qname,
flag,
rname,
pos,
mapq,
cigar,
rnext,
pnext,
tlen,
sequence,
quality,
})
}
pub fn sam_stats(records: &[SamRecord]) -> SamStats {
let total_reads = records.len();
let mut mapped: usize = 0;
let mut unmapped: usize = 0;
let mut mapq_sum: u64 = 0;
let mut length_sum: u64 = 0;
let mut mapq_counts = [0usize; 256];
for record in records {
length_sum += record.seq_len() as u64;
if record.is_mapped() {
mapped += 1;
mapq_sum += record.mapq as u64;
mapq_counts[record.mapq as usize] += 1;
} else {
unmapped += 1;
}
}
let avg_mapq = if mapped > 0 {
mapq_sum as f64 / mapped as f64
} else {
0.0
};
let avg_length = if total_reads > 0 {
length_sum as f64 / total_reads as f64
} else {
0.0
};
let mapq_distribution: Vec<(u8, usize)> = mapq_counts
.iter()
.enumerate()
.filter(|(_, &count)| count > 0)
.map(|(val, &count)| (val as u8, count))
.collect();
SamStats {
total_reads,
mapped,
unmapped,
avg_mapq,
avg_length,
mapq_distribution,
}
}
pub fn sam_stats_from_path(path: impl AsRef<Path>) -> Result<SamStats> {
let path = path.as_ref();
let file = File::open(path).map_err(|e| {
CyaneaError::Io(std::io::Error::new(
e.kind(),
format!("{}: {}", path.display(), e),
))
})?;
let reader = BufReader::new(file);
let mut total_reads: usize = 0;
let mut mapped: usize = 0;
let mut unmapped: usize = 0;
let mut mapq_sum: u64 = 0;
let mut length_sum: u64 = 0;
let mut mapq_counts = [0usize; 256];
for (line_num, line_result) in reader.lines().enumerate() {
let line = line_result.map_err(CyaneaError::Io)?;
let line = line.trim();
if line.is_empty() || line.starts_with('@') {
continue;
}
let record = parse_sam_record(line, line_num + 1, path)?;
total_reads += 1;
length_sum += record.seq_len() as u64;
if record.is_mapped() {
mapped += 1;
mapq_sum += record.mapq as u64;
mapq_counts[record.mapq as usize] += 1;
} else {
unmapped += 1;
}
}
let avg_mapq = if mapped > 0 {
mapq_sum as f64 / mapped as f64
} else {
0.0
};
let avg_length = if total_reads > 0 {
length_sum as f64 / total_reads as f64
} else {
0.0
};
let mapq_distribution: Vec<(u8, usize)> = mapq_counts
.iter()
.enumerate()
.filter(|(_, &count)| count > 0)
.map(|(val, &count)| (val as u8, count))
.collect();
Ok(SamStats {
total_reads,
mapped,
unmapped,
avg_mapq,
avg_length,
mapq_distribution,
})
}
#[derive(Debug, Clone)]
pub struct SamPair {
pub r1: SamRecord,
pub r2: SamRecord,
}
impl SamPair {
pub fn insert_size(&self) -> i64 {
self.r1.tlen
}
}
#[derive(Debug, Clone)]
pub struct PairedSamStats {
pub base: SamStats,
pub paired_count: usize,
pub proper_pair_count: usize,
pub singletons: usize,
pub avg_insert_size: f64,
}
pub fn pair_sam_records(records: &[SamRecord]) -> Vec<SamPair> {
let mut first_by_name: HashMap<&str, &SamRecord> = HashMap::new();
let mut pairs = Vec::new();
for rec in records {
if rec.is_secondary() || rec.is_supplementary() {
continue;
}
if rec.is_first_in_pair() {
first_by_name.insert(&rec.qname, rec);
}
}
for rec in records {
if rec.is_secondary() || rec.is_supplementary() {
continue;
}
if rec.is_second_in_pair() {
if let Some(r1) = first_by_name.get(rec.qname.as_str()) {
pairs.push(SamPair {
r1: (*r1).clone(),
r2: rec.clone(),
});
}
}
}
pairs
}
pub fn filter_proper_pairs(records: &[SamRecord]) -> Vec<&SamRecord> {
records.iter().filter(|r| r.is_proper_pair()).collect()
}
pub fn paired_sam_stats(records: &[SamRecord]) -> PairedSamStats {
let base = sam_stats(records);
let mut paired_count: usize = 0;
let mut proper_pair_count: usize = 0;
let mut singletons: usize = 0;
let mut insert_sum: i64 = 0;
let mut insert_count: usize = 0;
for rec in records {
if rec.is_paired() {
paired_count += 1;
if rec.is_proper_pair() {
proper_pair_count += 1;
}
if rec.is_mapped() && rec.is_mate_unmapped() {
singletons += 1;
}
if rec.is_first_in_pair() && rec.is_proper_pair() && rec.tlen != 0 {
insert_sum += rec.tlen.abs();
insert_count += 1;
}
}
}
let avg_insert_size = if insert_count > 0 {
insert_sum as f64 / insert_count as f64
} else {
0.0
};
PairedSamStats {
base,
paired_count,
proper_pair_count,
singletons,
avg_insert_size,
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
fn write_sam(content: &str) -> NamedTempFile {
let mut file = NamedTempFile::with_suffix(".sam").unwrap();
write!(file, "{}", content).unwrap();
file.flush().unwrap();
file
}
const BASIC_SAM: &str = "\
@HD\tVN:1.6\tSO:coordinate
@SQ\tSN:chr1\tLN:248956422
@SQ\tSN:chr2\tLN:242193529
read1\t0\tchr1\t100\t60\t50M\t*\t0\t0\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*
read2\t16\tchr1\t200\t30\t50M\t*\t0\t0\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*
read3\t4\t*\t0\t0\t*\t*\t0\t0\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*
";
#[test]
fn test_parse_basic_sam() {
let file = write_sam(BASIC_SAM);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records.len(), 3);
assert_eq!(records[0].qname, "read1");
assert_eq!(records[0].flag, 0);
assert_eq!(records[0].rname, "chr1");
assert_eq!(records[0].pos, 100);
assert_eq!(records[0].mapq, 60);
assert_eq!(records[0].cigar, "50M");
assert_eq!(records[0].seq_len(), 50);
assert!(records[0].is_mapped());
assert!(!records[0].is_unmapped());
assert_eq!(records[1].qname, "read2");
assert_eq!(records[1].flag, 16);
assert_eq!(records[1].rname, "chr1");
assert_eq!(records[1].pos, 200);
assert_eq!(records[1].mapq, 30);
assert!(records[1].is_mapped());
assert_eq!(records[2].qname, "read3");
assert_eq!(records[2].flag, 4);
assert_eq!(records[2].rname, "*");
assert_eq!(records[2].pos, 0);
assert_eq!(records[2].mapq, 0);
assert_eq!(records[2].cigar, "*");
assert!(records[2].is_unmapped());
assert!(!records[2].is_mapped());
}
#[test]
fn test_sam_stats_basic() {
let file = write_sam(BASIC_SAM);
let records = parse_sam(file.path()).unwrap();
let stats = sam_stats(&records);
assert_eq!(stats.total_reads, 3);
assert_eq!(stats.mapped, 2);
assert_eq!(stats.unmapped, 1);
assert!((stats.avg_mapq - 45.0).abs() < f64::EPSILON);
assert!((stats.avg_length - 50.0).abs() < f64::EPSILON);
assert_eq!(stats.mapq_distribution.len(), 2);
assert_eq!(stats.mapq_distribution[0], (30, 1));
assert_eq!(stats.mapq_distribution[1], (60, 1));
}
#[test]
fn test_sam_stats_from_path() {
let file = write_sam(BASIC_SAM);
let stats = sam_stats_from_path(file.path()).unwrap();
assert_eq!(stats.total_reads, 3);
assert_eq!(stats.mapped, 2);
assert_eq!(stats.unmapped, 1);
assert!((stats.avg_mapq - 45.0).abs() < f64::EPSILON);
}
#[test]
fn test_sam_header_only() {
let file = write_sam(
"@HD\tVN:1.6\tSO:coordinate\n\
@SQ\tSN:chr1\tLN:248956422\n",
);
let records = parse_sam(file.path()).unwrap();
assert!(records.is_empty());
}
#[test]
fn test_sam_empty_file() {
let file = write_sam("");
let records = parse_sam(file.path()).unwrap();
assert!(records.is_empty());
}
#[test]
fn test_sam_stats_empty() {
let stats = sam_stats(&[]);
assert_eq!(stats.total_reads, 0);
assert_eq!(stats.mapped, 0);
assert_eq!(stats.unmapped, 0);
assert_eq!(stats.avg_mapq, 0.0);
assert_eq!(stats.avg_length, 0.0);
assert!(stats.mapq_distribution.is_empty());
}
#[test]
fn test_sam_all_unmapped() {
let file = write_sam(
"read1\t4\t*\t0\t0\t*\t*\t0\t0\tACGT\t*\n\
read2\t4\t*\t0\t0\t*\t*\t0\t0\tGGCC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let stats = sam_stats(&records);
assert_eq!(stats.total_reads, 2);
assert_eq!(stats.mapped, 0);
assert_eq!(stats.unmapped, 2);
assert_eq!(stats.avg_mapq, 0.0); assert!((stats.avg_length - 4.0).abs() < f64::EPSILON);
assert!(stats.mapq_distribution.is_empty());
}
#[test]
fn test_sam_all_mapped() {
let file = write_sam(
"read1\t0\tchr1\t100\t60\t10M\t*\t0\t0\tACGTACGTAC\tIIIIIIIIII\n\
read2\t0\tchr1\t200\t60\t10M\t*\t0\t0\tGCTAGCTAGC\tIIIIIIIIII\n\
read3\t16\tchr2\t300\t40\t10M\t*\t0\t0\tTTTTTTTTTT\tIIIIIIIIII\n",
);
let records = parse_sam(file.path()).unwrap();
let stats = sam_stats(&records);
assert_eq!(stats.total_reads, 3);
assert_eq!(stats.mapped, 3);
assert_eq!(stats.unmapped, 0);
assert!((stats.avg_mapq - 160.0 / 3.0).abs() < 0.001);
assert!((stats.avg_length - 10.0).abs() < f64::EPSILON);
assert_eq!(stats.mapq_distribution.len(), 2);
assert_eq!(stats.mapq_distribution[0], (40, 1));
assert_eq!(stats.mapq_distribution[1], (60, 2));
}
#[test]
fn test_sam_too_few_columns() {
let file = write_sam("read1\t0\tchr1\t100\t60\n");
let result = parse_sam(file.path());
assert!(result.is_err());
let err_msg = format!("{}", result.unwrap_err());
assert!(err_msg.contains("11 tab-separated columns"));
}
#[test]
fn test_sam_invalid_flag() {
let file = write_sam(
"read1\tNOTANUMBER\tchr1\t100\t60\t50M\t*\t0\t0\tACGT\t*\n",
);
let result = parse_sam(file.path());
assert!(result.is_err());
let err_msg = format!("{}", result.unwrap_err());
assert!(err_msg.contains("invalid FLAG"));
}
#[test]
fn test_sam_invalid_pos() {
let file = write_sam(
"read1\t0\tchr1\tXYZ\t60\t50M\t*\t0\t0\tACGT\t*\n",
);
let result = parse_sam(file.path());
assert!(result.is_err());
let err_msg = format!("{}", result.unwrap_err());
assert!(err_msg.contains("invalid POS"));
}
#[test]
fn test_sam_invalid_mapq() {
let file = write_sam(
"read1\t0\tchr1\t100\t999\t50M\t*\t0\t0\tACGT\t*\n",
);
let result = parse_sam(file.path());
assert!(result.is_err());
let err_msg = format!("{}", result.unwrap_err());
assert!(err_msg.contains("invalid MAPQ"));
}
#[test]
fn test_sam_file_not_found() {
let result = parse_sam("/nonexistent/file.sam");
assert!(result.is_err());
}
#[test]
fn test_sam_with_optional_fields() {
let file = write_sam(
"read1\t0\tchr1\t100\t60\t50M\t*\t0\t0\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\tNM:i:2\tMD:Z:48A1\n",
);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records.len(), 1);
assert_eq!(records[0].qname, "read1");
assert_eq!(records[0].mapq, 60);
}
#[test]
fn test_sam_paired_end_flags() {
let file = write_sam(
"read1\t99\tchr1\t100\t60\t50M\t=\t200\t150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n\
read1\t147\tchr1\t200\t60\t50M\t=\t100\t-150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records.len(), 2);
assert!(records[0].is_mapped());
assert!(records[1].is_mapped());
assert_eq!(records[0].flag, 99);
assert_eq!(records[1].flag, 147);
}
#[test]
fn test_sam_seq_len_star() {
let file = write_sam(
"read1\t4\t*\t0\t0\t*\t*\t0\t0\t*\t*\n",
);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records.len(), 1);
assert_eq!(records[0].seq_len(), 0);
assert_eq!(records[0].sequence, "*");
}
#[test]
fn test_sam_mapq_255() {
let file = write_sam(
"read1\t0\tchr1\t100\t255\t50M\t*\t0\t0\tACGT\t*\n",
);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records[0].mapq, 255);
}
#[test]
fn test_sam_skip_blank_lines() {
let file = write_sam(
"\n\
@HD\tVN:1.6\n\
\n\
read1\t0\tchr1\t100\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\n\
\n\
read2\t0\tchr1\t200\t40\t10M\t*\t0\t0\tGCTAGCTAGC\t*\n\
\n",
);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records.len(), 2);
}
#[test]
fn test_sam_quality_string() {
let file = write_sam(
"read1\t0\tchr1\t100\t60\t4M\t*\t0\t0\tACGT\tIIII\n",
);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records[0].quality, "IIII");
}
#[test]
fn test_sam_stats_mapq_distribution_sorted() {
let file = write_sam(
"r1\t0\tchr1\t100\t10\t5M\t*\t0\t0\tACGTA\t*\n\
r2\t0\tchr1\t200\t60\t5M\t*\t0\t0\tACGTA\t*\n\
r3\t0\tchr1\t300\t10\t5M\t*\t0\t0\tACGTA\t*\n\
r4\t0\tchr1\t400\t30\t5M\t*\t0\t0\tACGTA\t*\n\
r5\t0\tchr1\t500\t60\t5M\t*\t0\t0\tACGTA\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let stats = sam_stats(&records);
assert_eq!(stats.total_reads, 5);
assert_eq!(stats.mapped, 5);
assert_eq!(stats.mapq_distribution.len(), 3);
assert_eq!(stats.mapq_distribution[0], (10, 2));
assert_eq!(stats.mapq_distribution[1], (30, 1));
assert_eq!(stats.mapq_distribution[2], (60, 2));
assert!((stats.avg_mapq - 34.0).abs() < f64::EPSILON);
}
#[test]
fn test_parse_rnext_pnext_tlen() {
let file = write_sam(
"read1\t99\tchr1\t100\t60\t50M\t=\t200\t150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n\
read1\t147\tchr1\t200\t60\t50M\t=\t100\t-150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records[0].rnext, "=");
assert_eq!(records[0].pnext, 200);
assert_eq!(records[0].tlen, 150);
assert_eq!(records[1].rnext, "=");
assert_eq!(records[1].pnext, 100);
assert_eq!(records[1].tlen, -150);
}
#[test]
fn test_rnext_different_chrom() {
let file = write_sam(
"read1\t99\tchr1\t100\t60\t10M\tchr2\t500\t0\tACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records[0].rnext, "chr2");
assert_eq!(records[0].pnext, 500);
assert_eq!(records[0].tlen, 0);
}
#[test]
fn test_rnext_star_unmapped_mate() {
let file = write_sam(
"read1\t0\tchr1\t100\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
assert_eq!(records[0].rnext, "*");
assert_eq!(records[0].pnext, 0);
assert_eq!(records[0].tlen, 0);
}
#[test]
fn test_flag_helpers() {
let file = write_sam(
"read1\t99\tchr1\t100\t60\t10M\t=\t200\t150\tACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let r = &records[0];
assert!(r.is_paired());
assert!(r.is_proper_pair());
assert!(!r.is_unmapped());
assert!(!r.is_mate_unmapped());
assert!(!r.is_reverse());
assert!(r.is_mate_reverse());
assert!(r.is_first_in_pair());
assert!(!r.is_second_in_pair());
assert!(!r.is_secondary());
assert!(!r.is_supplementary());
}
#[test]
fn test_flag_second_in_pair() {
let file = write_sam(
"read1\t147\tchr1\t200\t60\t10M\t=\t100\t-150\tACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let r = &records[0];
assert!(r.is_paired());
assert!(r.is_proper_pair());
assert!(r.is_reverse());
assert!(r.is_second_in_pair());
assert!(!r.is_first_in_pair());
}
#[test]
fn test_flag_secondary_supplementary() {
let file = write_sam(
"read1\t256\tchr1\t100\t0\t10M\t*\t0\t0\tACGTACGTAC\t*\n\
read2\t2048\tchr1\t200\t0\t10M\t*\t0\t0\tACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
assert!(records[0].is_secondary());
assert!(!records[0].is_supplementary());
assert!(!records[1].is_secondary());
assert!(records[1].is_supplementary());
}
#[test]
fn test_pair_sam_records_basic() {
let file = write_sam(
"read1\t99\tchr1\t100\t60\t50M\t=\t200\t150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n\
read1\t147\tchr1\t200\t60\t50M\t=\t100\t-150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n\
read2\t99\tchr1\t300\t60\t50M\t=\t400\t150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n\
read2\t147\tchr1\t400\t60\t50M\t=\t300\t-150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let pairs = pair_sam_records(&records);
assert_eq!(pairs.len(), 2);
assert_eq!(pairs[0].r1.qname, "read1");
assert_eq!(pairs[0].r2.qname, "read1");
assert_eq!(pairs[0].insert_size(), 150);
assert_eq!(pairs[1].r1.qname, "read2");
}
#[test]
fn test_pair_sam_records_singletons() {
let file = write_sam(
"read1\t99\tchr1\t100\t60\t10M\t=\t200\t150\tACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let pairs = pair_sam_records(&records);
assert!(pairs.is_empty());
}
#[test]
fn test_pair_sam_records_excludes_secondary() {
let file = write_sam(
"read1\t99\tchr1\t100\t60\t10M\t=\t200\t150\tACGTACGTAC\t*\n\
read1\t147\tchr1\t200\t60\t10M\t=\t100\t-150\tACGTACGTAC\t*\n\
read1\t355\tchr2\t500\t0\t10M\t=\t200\t0\tACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let pairs = pair_sam_records(&records);
assert_eq!(pairs.len(), 1);
}
#[test]
fn test_filter_proper_pairs() {
let file = write_sam(
"read1\t99\tchr1\t100\t60\t10M\t=\t200\t150\tACGTACGTAC\t*\n\
read2\t0\tchr1\t300\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let proper = filter_proper_pairs(&records);
assert_eq!(proper.len(), 1);
assert_eq!(proper[0].qname, "read1");
}
#[test]
fn test_paired_sam_stats() {
let file = write_sam(
"read1\t99\tchr1\t100\t60\t50M\t=\t200\t150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n\
read1\t147\tchr1\t200\t60\t50M\t=\t100\t-150\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n\
read2\t99\tchr1\t300\t60\t50M\t=\t500\t250\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n\
read2\t147\tchr1\t500\t60\t50M\t=\t300\t-250\tACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let stats = paired_sam_stats(&records);
assert_eq!(stats.base.total_reads, 4);
assert_eq!(stats.base.mapped, 4);
assert_eq!(stats.paired_count, 4);
assert_eq!(stats.proper_pair_count, 4);
assert_eq!(stats.singletons, 0);
assert!((stats.avg_insert_size - 200.0).abs() < f64::EPSILON);
}
#[test]
fn test_paired_sam_stats_singleton() {
let file = write_sam(
"read1\t73\tchr1\t100\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\n",
);
let records = parse_sam(file.path()).unwrap();
let stats = paired_sam_stats(&records);
assert_eq!(stats.paired_count, 1);
assert_eq!(stats.singletons, 1);
assert_eq!(stats.proper_pair_count, 0);
assert!((stats.avg_insert_size - 0.0).abs() < f64::EPSILON);
}
}