use crate::core::{GenomicReader, GenomicRecordIterator};
use crate::error::{Error, Result};
use crate::formats::bam::{CigarOp, RefSeq, TagValue};
use crate::io::Compression;
use flate2::read::MultiGzDecoder;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
#[derive(Debug, Clone)]
pub struct SamHeader {
pub lines: Vec<String>,
pub references: Vec<RefSeq>,
ref_index: HashMap<String, usize>,
}
impl SamHeader {
pub fn new() -> Self {
Self {
lines: Vec::new(),
references: Vec::new(),
ref_index: HashMap::new(),
}
}
pub fn ref_index(&self, name: &str) -> Option<usize> {
self.ref_index.get(name).copied()
}
pub fn ref_name(&self, idx: usize) -> Option<&str> {
self.references.get(idx).map(|r| r.name.as_str())
}
}
impl Default for SamHeader {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct SamRecord {
pub qname: String,
pub flag: u16,
pub rname: String,
pub pos: i32,
pub mapq: u8,
pub cigar: Vec<CigarOp>,
pub rnext: String,
pub pnext: i32,
pub tlen: i32,
pub seq: Vec<u8>,
pub qual: Vec<u8>,
pub tags: HashMap<String, TagValue>,
}
impl SamRecord {
pub fn is_paired(&self) -> bool {
(self.flag & 0x1) != 0
}
pub fn is_proper_pair(&self) -> bool {
(self.flag & 0x2) != 0
}
pub fn is_unmapped(&self) -> bool {
(self.flag & 0x4) != 0
}
pub fn is_mate_unmapped(&self) -> bool {
(self.flag & 0x8) != 0
}
pub fn is_reverse(&self) -> bool {
(self.flag & 0x10) != 0
}
pub fn is_mate_reverse(&self) -> bool {
(self.flag & 0x20) != 0
}
pub fn is_first_in_pair(&self) -> bool {
(self.flag & 0x40) != 0
}
pub fn is_second_in_pair(&self) -> bool {
(self.flag & 0x80) != 0
}
pub fn is_secondary(&self) -> bool {
(self.flag & 0x100) != 0
}
pub fn is_qc_fail(&self) -> bool {
(self.flag & 0x200) != 0
}
pub fn is_duplicate(&self) -> bool {
(self.flag & 0x400) != 0
}
pub fn is_supplementary(&self) -> bool {
(self.flag & 0x800) != 0
}
pub fn cigar_string(&self) -> String {
if self.cigar.is_empty() {
"*".to_string()
} else {
self.cigar.iter().map(|op| op.to_string()).collect()
}
}
pub fn seq_string(&self) -> String {
if self.seq.is_empty() {
"*".to_string()
} else {
String::from_utf8_lossy(&self.seq).to_string()
}
}
}
pub struct SamReader {
reader: Box<dyn BufRead>,
header: Option<SamHeader>,
line_buffer: String,
line_number: usize,
}
impl SamReader {
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
let file = File::open(path)?;
let compression = Compression::from_path(path);
let reader: Box<dyn BufRead> = match compression {
Compression::Gzip => Box::new(BufReader::new(MultiGzDecoder::new(file))),
_ => Box::new(BufReader::new(file)),
};
Ok(Self {
reader,
header: None,
line_buffer: String::with_capacity(512),
line_number: 0,
})
}
pub fn new<R: BufRead + 'static>(reader: R) -> Self {
Self {
reader: Box::new(reader),
header: None,
line_buffer: String::with_capacity(512),
line_number: 0,
}
}
pub fn read_header(&mut self) -> Result<&SamHeader> {
if self.header.is_some() {
return Ok(self.header.as_ref().unwrap());
}
let mut header = SamHeader::new();
loop {
self.line_buffer.clear();
let bytes = self.reader.read_line(&mut self.line_buffer)?;
if bytes == 0 {
break;
}
self.line_number += 1;
let line = self.line_buffer.trim();
if line.is_empty() {
continue;
}
if !line.starts_with('@') {
self.line_number -= 1;
break;
}
header.lines.push(line.to_string());
if line.starts_with("@SQ") {
let mut name = None;
let mut length = None;
for field in line.split('\t').skip(1) {
if let Some((key, value)) = field.split_once(':') {
match key {
"SN" => name = Some(value.to_string()),
"LN" => length = value.parse::<u32>().ok(),
_ => {}
}
}
}
if let (Some(name), Some(length)) = (name, length) {
let idx = header.references.len();
header.ref_index.insert(name.clone(), idx);
header.references.push(RefSeq { name, length });
}
}
}
self.header = Some(header);
Ok(self.header.as_ref().unwrap())
}
pub fn header(&self) -> Option<&SamHeader> {
self.header.as_ref()
}
fn parse_cigar(cigar_str: &str) -> Result<Vec<CigarOp>> {
CigarOp::parse_cigar_string(cigar_str)
}
fn parse_record(&mut self, line: &str) -> Result<SamRecord> {
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() < 11 {
return Err(Error::Parse(format!(
"Line {}: SAM record must have at least 11 fields, got {}",
self.line_number,
fields.len()
)));
}
let qname = fields[0].to_string();
let flag = fields[1].parse::<u16>().map_err(|_| {
Error::Parse(format!("Line {}: Invalid FLAG: {}", self.line_number, fields[1]))
})?;
let rname = fields[2].to_string();
let pos = fields[3].parse::<i32>().map_err(|_| {
Error::Parse(format!("Line {}: Invalid POS: {}", self.line_number, fields[3]))
})?;
let mapq = fields[4].parse::<u8>().map_err(|_| {
Error::Parse(format!("Line {}: Invalid MAPQ: {}", self.line_number, fields[4]))
})?;
let cigar = Self::parse_cigar(fields[5])?;
let rnext = fields[6].to_string();
let pnext = fields[7].parse::<i32>().map_err(|_| {
Error::Parse(format!("Line {}: Invalid PNEXT: {}", self.line_number, fields[7]))
})?;
let tlen = fields[8].parse::<i32>().map_err(|_| {
Error::Parse(format!("Line {}: Invalid TLEN: {}", self.line_number, fields[8]))
})?;
let seq = if fields[9] == "*" {
Vec::new()
} else {
fields[9].as_bytes().to_vec()
};
let qual = if fields[10] == "*" {
Vec::new()
} else {
fields[10].bytes().map(|b| b.saturating_sub(33)).collect()
};
let mut tags = HashMap::new();
for field in fields.iter().skip(11) {
if let Some((tag, rest)) = field.split_once(':') {
if let Some((type_str, value)) = rest.split_once(':') {
let tag_value = match type_str {
"A" => {
TagValue::Char(value.bytes().next().unwrap_or(0))
}
"i" => {
TagValue::Int(value.parse().unwrap_or(0))
}
"f" => {
TagValue::Float(value.parse().unwrap_or(0.0))
}
"Z" => {
TagValue::String(value.to_string())
}
"H" => {
TagValue::String(value.to_string())
}
"B" => {
TagValue::ByteArray(value.as_bytes().to_vec())
}
_ => continue,
};
tags.insert(tag.to_string(), tag_value);
}
}
}
Ok(SamRecord {
qname,
flag,
rname,
pos,
mapq,
cigar,
rnext,
pnext,
tlen,
seq,
qual,
tags,
})
}
pub fn compute_stats(
mut self,
_config: Option<crate::parallel::ParallelConfig>,
) -> Result<super::stats::SamStats> {
self.read_header()?;
super::stats::SamStats::compute(&mut self)
}
}
impl GenomicRecordIterator for SamReader {
type Record = SamRecord;
fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
self.line_buffer.clear();
let bytes = self.reader.read_line(&mut self.line_buffer)?;
if bytes == 0 {
return Ok(None);
}
self.line_number += 1;
let line = self.line_buffer.trim();
if line.is_empty() {
return self.next_raw();
}
if line.starts_with('@') {
return self.next_raw();
}
Ok(Some(line.as_bytes().to_vec()))
}
fn next_record(&mut self) -> Result<Option<Self::Record>> {
self.line_buffer.clear();
let bytes = self.reader.read_line(&mut self.line_buffer)?;
if bytes == 0 {
return Ok(None);
}
self.line_number += 1;
let line = self.line_buffer.trim().to_string();
if line.is_empty() {
return self.next_record();
}
if line.starts_with('@') {
return self.next_record();
}
Ok(Some(self.parse_record(&line)?))
}
}
impl GenomicReader for SamReader {
type Metadata = SamHeader;
fn metadata(&self) -> &Self::Metadata {
self.header
.as_ref()
.expect("Header not read yet. Call read_header() first.")
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cigar_parsing() {
let cigar = SamReader::parse_cigar("10M2I5D").unwrap();
assert_eq!(cigar.len(), 3);
assert_eq!(cigar[0], CigarOp::Match(10));
assert_eq!(cigar[1], CigarOp::Ins(2));
assert_eq!(cigar[2], CigarOp::Del(5));
}
#[test]
fn test_cigar_star() {
let cigar = SamReader::parse_cigar("*").unwrap();
assert!(cigar.is_empty());
}
#[test]
fn test_sam_record_flags() {
let record = SamRecord {
qname: "read1".to_string(),
flag: 0x1 | 0x2, rname: "chr1".to_string(),
pos: 100,
mapq: 60,
cigar: vec![CigarOp::Match(100)],
rnext: "=".to_string(),
pnext: 200,
tlen: 150,
seq: b"ACGT".to_vec(),
qual: vec![30, 30, 30, 30],
tags: HashMap::new(),
};
assert!(record.is_paired());
assert!(record.is_proper_pair());
assert!(!record.is_unmapped());
assert!(!record.is_duplicate());
}
}