use std::io::{self, Read, Write};
use std::io::ErrorKind::{self, InvalidData, UnexpectedEof};
use std::cell::Cell;
use std::str::from_utf8;
use std::fmt;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
pub mod tags;
pub mod cigar;
pub mod sequence;
use super::header::Header;
use super::index;
pub use cigar::Cigar;
pub use sequence::Sequence;
pub use sequence::Qualities;
pub const RECORD_PAIRED: u16 = 0x1;
pub const ALL_SEGMENTS_ALIGNED: u16 = 0x2;
pub const RECORD_UNMAPPED: u16 = 0x4;
pub const MATE_UNMAPPED: u16 = 0x8;
pub const RECORD_REVERSE_STRAND: u16 = 0x10;
pub const MATE_REVERSE_STRAND: u16 = 0x20;
pub const FIRST_IN_PAIR: u16 = 0x40;
pub const LAST_IN_PAIR: u16 = 0x80;
pub const SECONDARY: u16 = 0x100;
pub const RECORD_FAILS_QC: u16 = 0x200;
pub const PCR_OR_OPTICAL_DUPLICATE: u16 = 0x400;
pub const SUPPLEMENTARY: u16 = 0x800;
#[derive(Clone, Copy, Debug)]
pub struct Flag(pub u16);
impl Flag {
pub fn all_bits(&self, mask: u16) -> bool {
(self.0 & mask) == mask
}
pub fn any_bit(&self, mask: u16) -> bool {
(self.0 & mask) != 0
}
pub fn no_bits(&self, mask: u16) -> bool {
(self.0 & mask) == 0
}
pub fn is_paired(&self) -> bool {
self.any_bit(RECORD_PAIRED)
}
pub fn all_segments_aligned(&self) -> bool {
self.any_bit(ALL_SEGMENTS_ALIGNED)
}
pub fn is_mapped(&self) -> bool {
self.no_bits(RECORD_UNMAPPED)
}
pub fn mate_is_mapped(&self) -> bool {
self.no_bits(MATE_UNMAPPED)
}
pub fn is_reverse_strand(&self) -> bool {
self.any_bit(RECORD_REVERSE_STRAND)
}
pub fn mate_is_reverse_strand(&self) -> bool {
self.any_bit(MATE_REVERSE_STRAND)
}
pub fn first_in_pair(&self) -> bool {
self.any_bit(FIRST_IN_PAIR)
}
pub fn last_in_pair(&self) -> bool {
self.any_bit(LAST_IN_PAIR)
}
pub fn is_secondary(&self) -> bool {
self.any_bit(SECONDARY)
}
pub fn fails_quality_controls(&self) -> bool {
self.any_bit(RECORD_FAILS_QC)
}
pub fn is_duplicate(&self) -> bool {
self.any_bit(PCR_OR_OPTICAL_DUPLICATE)
}
pub fn is_supplementary(&self) -> bool {
self.any_bit(SUPPLEMENTARY)
}
pub fn set_paired(&mut self, paired: bool) {
if paired {
self.0 |= RECORD_PAIRED;
} else {
self.0 &= !RECORD_PAIRED;
}
}
pub fn set_all_segments_aligned(&mut self, all_segments_aligned: bool) {
if all_segments_aligned {
self.0 |= ALL_SEGMENTS_ALIGNED;
} else {
self.0 &= !ALL_SEGMENTS_ALIGNED;
}
}
pub fn set_mapped(&mut self, mapped: bool) {
if mapped {
self.0 &= !RECORD_UNMAPPED;
} else {
self.0 |= RECORD_UNMAPPED;
}
}
pub fn set_mate_mapped(&mut self, mate_mapped: bool) {
if mate_mapped {
self.0 &= !MATE_UNMAPPED;
} else {
self.0 |= MATE_UNMAPPED;
}
}
pub fn set_strand(&mut self, forward_strand: bool) {
if forward_strand {
self.0 &= !RECORD_REVERSE_STRAND;
} else {
self.0 |= RECORD_REVERSE_STRAND;
}
}
pub fn set_mate_strand(&mut self, forward_strand: bool) {
if forward_strand {
self.0 &= !MATE_REVERSE_STRAND;
} else {
self.0 |= MATE_REVERSE_STRAND;
}
}
pub fn set_first_in_pair(&mut self, is_first: bool) {
if is_first {
self.0 |= FIRST_IN_PAIR;
} else {
self.0 &= !FIRST_IN_PAIR;
}
}
pub fn set_last_in_pair(&mut self, is_last: bool) {
if is_last {
self.0 |= LAST_IN_PAIR;
} else {
self.0 &= !LAST_IN_PAIR;
}
}
pub fn set_secondary(&mut self, is_secondary: bool) {
if is_secondary {
self.0 |= SECONDARY;
} else {
self.0 &= !SECONDARY;
}
}
pub fn make_fail_quality_controls(&mut self, fails: bool) {
if fails {
self.0 |= RECORD_FAILS_QC;
} else {
self.0 &= !RECORD_FAILS_QC;
}
}
pub fn set_duplicate(&mut self, is_duplicate: bool) {
if is_duplicate {
self.0 |= PCR_OR_OPTICAL_DUPLICATE;
} else {
self.0 &= !PCR_OR_OPTICAL_DUPLICATE;
}
}
pub fn set_supplementary(&mut self, is_supplementary: bool) {
if is_supplementary {
self.0 |= SUPPLEMENTARY;
} else {
self.0 &= !SUPPLEMENTARY;
}
}
}
impl fmt::Display for Flag {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "Flag({})", self.0)
}
}
pub(crate) fn resize<T: Default + Copy>(v: &mut Vec<T>, new_len: usize) {
unsafe {
if v.capacity() < new_len {
v.reserve(new_len);
v.set_len(v.capacity());
let new_val = T::default();
v.iter_mut().map(|x| *x = new_val).count();
}
v.set_len(new_len);
}
}
pub(crate) fn write_iterator<W, I>(writer: &mut W, mut iterator: I) -> io::Result<()>
where W: Write,
I: Iterator<Item = u8>,
{
const SIZE: usize = 1024;
let mut buffer = [0_u8; SIZE];
loop {
for i in 0..SIZE {
match iterator.next() {
Some(value) => buffer[i] = value,
None => {
return writer.write_all(&buffer[..i]);
}
}
}
writer.write_all(&buffer)?;
}
}
trait NextToErr<'a> {
fn try_next(&mut self, field: &'static str) -> io::Result<&'a str>;
}
impl<'a> NextToErr<'a> for std::str::Split<'a, char> {
fn try_next(&mut self, field: &'static str) -> io::Result<&'a str> {
self.next().ok_or_else(|| io::Error::new(UnexpectedEof,
format!("Truncated file: Cannot extract field {}", field)))
}
}
#[derive(Clone)]
pub struct Record {
ref_id: i32,
mate_ref_id: i32,
start: i32,
end: Cell<i32>,
mate_start: i32,
bin: Cell<u16>,
mapq: u8,
flag: Flag,
template_len: i32,
name: Vec<u8>,
cigar: Cigar,
seq: Sequence,
qual: Qualities,
tags: tags::TagViewer,
}
const BIN_UNKNOWN: u16 = std::u16::MAX;
impl Record {
pub fn new() -> Record {
Record {
ref_id: -1,
mate_ref_id: -1,
start: -1,
end: Cell::new(0),
mate_start: -1,
bin: Cell::new(BIN_UNKNOWN),
mapq: 0,
flag: Flag(0),
template_len: 0,
name: Vec::new(),
cigar: Cigar::new(),
seq: Sequence::new(),
qual: Qualities::new(),
tags: tags::TagViewer::new(),
}
}
pub fn clear(&mut self) {
self.ref_id = -1;
self.mate_ref_id = -1;
self.start = -1;
self.end.set(0);
self.mate_start = -1;
self.bin.set(BIN_UNKNOWN);
self.mapq = 0;
self.flag.0 = 0;
self.template_len = 0;
self.name.clear();
self.cigar.clear();
self.seq.clear();
self.qual.clear();
self.tags.clear();
}
fn corrupt(&mut self, text: &str) -> io::Error {
if self.name.is_empty() {
io::Error::new(InvalidData,
format!("Corrupted record {}: {}",
std::str::from_utf8(&self.name).unwrap_or("*NAME NOT UTF-8*"), text))
} else {
io::Error::new(InvalidData, format!("Corrupted record: {}", text))
}
}
pub fn fill_from_bam<R: Read>(&mut self, stream: &mut R) -> io::Result<bool> {
self.name.clear();
let block_size = match stream.read_i32::<LittleEndian>() {
Ok(value) => {
if value < 0 {
return Err(self.corrupt("Negative block size"));
}
value as usize
},
Err(e) => {
if e.kind() == ErrorKind::UnexpectedEof {
return Ok(false);
} else {
return Err(e);
}
},
};
let ref_id = stream.read_i32::<LittleEndian>()?;
if ref_id < -1 {
return Err(self.corrupt("Reference id < 1"));
}
self.set_ref_id(ref_id);
self.end.set(0);
let start = stream.read_i32::<LittleEndian>()?;
if start < -1 {
return Err(self.corrupt("Start < 1"));
}
self.set_start(start);
let name_len = stream.read_u8()?;
if name_len == 0 {
return Err(self.corrupt("Name length == 0"));
}
self.set_mapq(stream.read_u8()?);
self.bin.set(stream.read_u16::<LittleEndian>()?);
let cigar_len = stream.read_u16::<LittleEndian>()?;
self.set_flag(stream.read_u16::<LittleEndian>()?);
let qual_len = stream.read_i32::<LittleEndian>()?;
if qual_len < 0 {
return Err(self.corrupt("Negative sequence length"));
}
let qual_len = qual_len as usize;
if self.flag().is_mapped() && cigar_len == 0 {
return Err(self.corrupt("Mapped read has an empty CIGAR"));
}
let mate_ref_id = stream.read_i32::<LittleEndian>()?;
if mate_ref_id < -1 {
return Err(self.corrupt("Mate reference id < 1"));
}
self.set_mate_ref_id(mate_ref_id);
let mate_start = stream.read_i32::<LittleEndian>()?;
if mate_start < -1 {
return Err(self.corrupt("Mate start < 1"));
}
self.set_mate_start(mate_start);
self.set_template_len(stream.read_i32::<LittleEndian>()?);
resize(&mut self.name, name_len as usize - 1);
stream.read_exact(&mut self.name)?;
let _null_symbol = stream.read_u8()?;
self.cigar.fill_from(stream, cigar_len as usize)?;
self.seq.fill_from(stream, qual_len)?;
self.qual.fill_from(stream, qual_len)?;
let seq_len = (qual_len + 1) / 2;
let remaining_size = block_size - 32 - name_len as usize - 4 * cigar_len as usize
- seq_len - qual_len;
self.tags.fill_from(stream, remaining_size)?;
self.replace_cigar_if_needed()?;
Ok(true)
}
pub fn fill_from_sam(&mut self, line: &str, header: &Header) -> io::Result<()> {
let mut split = line.split('\t');
self.set_name(split.try_next("record name (QNAME)")?.bytes());
let flag = split.try_next("flag")?;
let flag = flag.parse()
.map_err(|_| self.corrupt(&format!("Cannot convert flag '{}' to int", flag)))?;
self.set_flag(flag);
let rname = split.try_next("reference name")?;
if rname == "*" {
self.set_ref_id(-1);
} else {
let r_id = header.reference_id(rname).ok_or_else(||
self.corrupt(&format!("Reference '{}' is not in the header", rname)))?;
self.set_ref_id(r_id as i32);
}
let start = split.try_next("start (POS)")?;
let start = start.parse::<i32>()
.map_err(|_| self.corrupt(&format!("Cannot convert POS '{}' to int", start)))? - 1;
if start < -1 {
return Err(self.corrupt("Start < -1"));
}
self.set_start(start);
let mapq = split.try_next("mapq")?;
let mapq = mapq.parse()
.map_err(|_| self.corrupt(&format!("Cannot convert MAPQ '{}' to int", mapq)))?;
self.set_mapq(mapq);
let cigar = split.try_next("CIGAR")?;
if cigar == "*" {
self.set_raw_cigar(std::iter::empty());
} else {
self.set_cigar(cigar.bytes()).map_err(|e| self.corrupt(&e))?;
}
if self.flag().is_mapped() && self.cigar.len() == 0 {
return Err(self.corrupt("Mapped read has an empty CIGAR"));
}
let rnext = split.try_next("mate reference name (RNEXT)")?;
if rnext == "*" {
self.set_mate_ref_id(-1);
} else if rnext == "=" {
self.set_mate_ref_id(self.ref_id);
} else {
let nr_id = header.reference_id(rnext).ok_or_else(||
self.corrupt(&format!("Reference '{}' is not in the header", rnext)))?;
self.set_mate_ref_id(nr_id as i32);
}
let mate_start = split.try_next("mate start (PNEXT)")?;
let mate_start = mate_start.parse::<i32>().map_err(|_|
self.corrupt(&format!("Cannot convert PNEXT '{}' to int", mate_start)))? - 1;
if mate_start < -1 {
return Err(self.corrupt("Mate start < -1"));
}
self.set_mate_start(mate_start);
let template_len = split.try_next("template length (TLEN)")?;
let template_len = template_len.parse::<i32>().map_err(|_|
self.corrupt(&format!("Cannot convert TLEN '{}' to int", template_len)))?;
self.set_template_len(template_len);
let seq = split.try_next("sequence")?;
let qual = split.try_next("qualities")?;
if seq == "*" {
self.set_seq_qual(std::iter::empty(), std::iter::empty()).map_err(|e| self.corrupt(&e))?;
} else if qual == "*" {
self.set_seq_qual(seq.bytes(), std::iter::empty()).map_err(|e| self.corrupt(&e))?;
} else {
self.set_seq_qual(seq.bytes(), qual.bytes().map(|q| q - 33)).map_err(|e| self.corrupt(&e))?;
}
self.tags.clear();
for tag in split {
self.tags.push_sam(tag)?;
}
Ok(())
}
fn replace_cigar_if_needed(&mut self) -> io::Result<()> {
if !self.cigar.is_empty() && self.cigar.at(0) ==
(self.seq.len() as u32, cigar::Operation::Soft) {
if self.cigar.len() != 2 {
return Err(self.corrupt("Record contains invalid Cigar"));
}
let (len, op) = self.cigar.at(1);
if op != cigar::Operation::Skip {
return Err(self.corrupt("Record contains invalid Cigar"));
}
self.end.set(self.start + len as i32);
let cigar_arr = match self.tags.get(b"CG") {
Some(tags::TagValue::IntArray(array_view)) => {
if array_view.int_type() != tags::IntegerType::U32 {
return Err(self.corrupt("CG tag has an incorrect type"));
}
array_view
},
_ => return Err(
self.corrupt("Record should contain tag CG, but does not")),
};
self.cigar.clear();
self.cigar.extend_from_raw(cigar_arr.iter().map(|el| el as u32));
std::mem::drop(cigar_arr);
self.tags.remove(b"CG");
}
Ok(())
}
pub fn shrink_to_fit(&mut self) {
self.name.shrink_to_fit();
self.cigar.shrink_to_fit();
self.seq.shrink_to_fit();
self.qual.shrink_to_fit();
self.tags.shrink_to_fit();
}
pub fn name(&self) -> &[u8] {
&self.name
}
pub fn sequence(&self) -> &Sequence {
&self.seq
}
pub fn qualities(&self) -> &Qualities {
&self.qual
}
pub fn cigar(&self) -> &Cigar {
&self.cigar
}
pub fn ref_id(&self) -> i32 {
self.ref_id
}
pub fn start(&self) -> i32 {
self.start
}
pub fn calculate_end(&self) -> i32 {
if self.cigar.is_empty() {
return 0;
}
let end = self.end.get();
if end != 0 {
return end;
}
let end = self.start + self.cigar.calculate_ref_len() as i32;
self.end.set(end);
end
}
pub fn query_len(&self) -> u32 {
if self.seq.available() {
self.seq.len() as u32
} else {
self.cigar.calculate_query_len()
}
}
pub fn aligned_query_start(&self) -> u32 {
if self.flag.is_mapped() {
self.cigar.soft_clipping(true)
} else {
self.query_len()
}
}
pub fn aligned_query_end(&self) -> u32 {
if self.flag.is_mapped() {
self.query_len() - self.cigar.soft_clipping(false)
} else {
0
}
}
pub fn calculate_bin(&self) -> u16 {
let bin = self.bin.get();
if bin != BIN_UNKNOWN {
return bin;
}
let end = self.calculate_end();
let bin = index::region_to_bin(self.start, end) as u16;
self.bin.set(bin);
bin
}
pub fn mapq(&self) -> u8 {
self.mapq
}
pub fn mate_ref_id(&self) -> i32 {
self.mate_ref_id
}
pub fn mate_start(&self) -> i32 {
self.mate_start
}
pub fn template_len(&self) -> i32 {
self.template_len
}
pub fn tags(&self) -> &tags::TagViewer {
&self.tags
}
pub fn tags_mut(&mut self) -> &mut tags::TagViewer {
&mut self.tags
}
pub fn write_sam<W: Write>(&self, f: &mut W, header: &Header) -> io::Result<()> {
f.write_all(&self.name)?;
write!(f, "\t{}\t", self.flag.0)?;
if self.ref_id < 0 {
f.write_all(b"*\t")?;
} else {
write!(f, "{}\t", header.reference_name(self.ref_id as u32)
.ok_or_else(|| io::Error::new(InvalidData, "Record has a reference id not in the header"))?)?;
}
write!(f, "{}\t{}\t", self.start + 1, self.mapq)?;
self.cigar.write_readable(f)?;
if self.mate_ref_id < 0 {
f.write_all(b"\t*\t")?;
} else if self.mate_ref_id == self.ref_id {
f.write_all(b"\t=\t")?;
} else {
write!(f, "\t{}\t", header.reference_name(self.mate_ref_id as u32)
.ok_or_else(|| io::Error::new(InvalidData, "Record has a reference id not in the header"))?)?;
}
write!(f, "{}\t{}\t", self.mate_start + 1, self.template_len)?;
self.seq.write_readable(f)?;
f.write_u8(b'\t')?;
self.qual.write_readable(f)?;
self.tags.write_sam(f)?;
writeln!(f)
}
pub fn write_bam<W: Write>(&self, stream: &mut W) -> io::Result<()> {
let raw_cigar_len = if self.cigar.len() <= 0xffff {
4 * self.cigar.len()
} else {
16 + 4 * self.cigar.len()
};
let total_block_len = 32 + self.name.len() + 1 + raw_cigar_len
+ self.seq.raw().len() + self.seq.len() + self.tags.raw().len();
stream.write_i32::<LittleEndian>(total_block_len as i32)?;
stream.write_i32::<LittleEndian>(self.ref_id)?;
stream.write_i32::<LittleEndian>(self.start)?;
stream.write_u8(self.name.len() as u8 + 1)?;
stream.write_u8(self.mapq)?;
stream.write_u16::<LittleEndian>(self.calculate_bin())?;
if self.cigar.len() <= 0xffff {
stream.write_u16::<LittleEndian>(self.cigar.len() as u16)?;
} else {
stream.write_u16::<LittleEndian>(2)?;
}
stream.write_u16::<LittleEndian>(self.flag.0)?;
stream.write_i32::<LittleEndian>(self.seq.len() as i32)?;
stream.write_i32::<LittleEndian>(self.mate_ref_id)?;
stream.write_i32::<LittleEndian>(self.mate_start)?;
stream.write_i32::<LittleEndian>(self.template_len)?;
stream.write_all(&self.name)?;
stream.write_u8(0)?;
if self.cigar.len() <= 0xffff {
for &el in self.cigar.raw() {
stream.write_u32::<LittleEndian>(el)?;
}
} else {
let seq_len = if self.seq.available() {
self.seq.len() as u32
} else {
self.cigar.calculate_query_len()
};
stream.write_u32::<LittleEndian>(seq_len << 4 | 4)?;
let ref_len = (self.calculate_end() - self.start) as u32;
stream.write_u32::<LittleEndian>(ref_len << 4 | 3)?;
}
stream.write_all(&self.seq.raw())?;
if self.qual.raw().len() == self.seq.len() {
stream.write_all(&self.qual.raw())?;
} else {
write_iterator(stream, std::iter::repeat(0xff).take(self.seq.len()))?;
}
stream.write_all(self.tags.raw())?;
if self.cigar.len() > 0xffff {
stream.write_all(b"CGBI")?;
stream.write_i32::<LittleEndian>(self.cigar.len() as i32)?;
for &el in self.cigar.raw() {
stream.write_u32::<LittleEndian>(el)?;
}
}
Ok(())
}
pub fn set_name<T: IntoIterator<Item = u8>>(&mut self, name: T) {
self.name.clear();
self.name.extend(name.into_iter().take(254));
}
pub fn flag(&self) -> Flag {
self.flag
}
pub fn flag_mut(&mut self) -> &mut Flag {
&mut self.flag
}
pub fn set_flag(&mut self, flag: u16) {
self.flag.0 = flag;
}
pub fn set_ref_id(&mut self, ref_id: i32) {
assert!(ref_id >= -1, "Reference id < -1");
self.ref_id = ref_id;
}
pub fn set_start(&mut self, start: i32) {
assert!(start >= -1, "Start < -1");
let difference = start - self.start;
self.start = start;
if self.end.get() != 0 {
*self.end.get_mut() += difference;
}
self.bin.set(BIN_UNKNOWN);
}
pub fn set_mapq(&mut self, mapq: u8) {
self.mapq = mapq;
}
pub fn set_mate_ref_id(&mut self, mate_ref_id: i32) {
assert!(mate_ref_id >= -1, "Mate reference id < -1");
self.mate_ref_id = mate_ref_id;
}
pub fn set_mate_start(&mut self, mate_start: i32) {
assert!(mate_start >= -1, "Mate start < -1");
self.mate_start = mate_start;
}
pub fn set_template_len(&mut self, template_len: i32) {
self.template_len = template_len;
}
pub fn set_seq_qual<T, U>(&mut self, sequence: T, qualities: U) -> Result<(), String>
where T: IntoIterator<Item = u8>,
U: IntoIterator<Item = u8>,
{
self.seq.clear();
if let Err(e) = self.seq.extend_from_text(sequence) {
self.seq.clear();
self.qual.clear();
return Err(e);
}
self.qual.clear();
self.qual.extend_from_raw(qualities);
if self.qual.available() && self.seq.len() != self.qual.len() {
let err = Err(format!("Trying to set sequence and qualities of different lengths: \
{} and {}", self.seq.len(), self.qual.len()));
self.seq.clear();
self.qual.clear();
err
} else {
Ok(())
}
}
pub fn set_raw_seq_qual<U>(&mut self, raw_seq: &[u8], qualities: U, len: usize)
-> Result<(), String>
where U: IntoIterator<Item = u8>,
{
self.seq.clear();
self.qual.clear();
self.qual.extend_from_raw(qualities);
if self.qual.available() && self.qual.len() != len {
self.qual.clear();
return Err(format!("Expected qualities length: {}, got {}", len, self.qual.len()));
}
if (len + 1) / 2 != raw_seq.len() {
self.seq.clear();
self.qual.clear();
return Err(format!("Expected raw sequence length: {}, got {}",
(len + 1) / 2, raw_seq.len()));
}
let mut slice = &raw_seq[..];
self.seq.fill_from(&mut slice, len).unwrap();
Ok(())
}
pub fn set_cigar<I: IntoIterator<Item = u8>>(&mut self, cigar: I) -> Result<(), String> {
self.end.set(0);
self.bin.set(BIN_UNKNOWN);
self.cigar.clear();
self.cigar.extend_from_text(cigar)
}
pub fn set_raw_cigar<I: IntoIterator<Item = u32>>(&mut self, cigar: I) {
self.end.set(0);
self.bin.set(BIN_UNKNOWN);
self.cigar.clear();
self.cigar.extend_from_raw(cigar);
}
pub fn aligned_pairs(&self) -> cigar::AlignedPairs {
self.cigar.aligned_pairs(self.start as u32)
}
pub fn matching_pairs(&self) -> cigar::MatchingPairs {
self.cigar.matching_pairs(self.start as u32)
}
pub fn alignment_entries(&self) -> Result<EntriesIter, EntriesError> {
if !self.sequence().available() {
return Err(EntriesError::NoSequence);
}
let md_tag = match self.tags().get(b"MD") {
Some(tags::TagValue::String(tag, _)) => tag,
Some(_) => return Err(EntriesError::IncorrectMD),
None => return Err(EntriesError::NoMD),
};
Ok(EntriesIter {
parent: self,
aligned_pairs: self.cigar.aligned_pairs(self.start as u32),
md_tag,
md_remaining_len: 0,
md_index: 0,
})
}
}
impl fmt::Debug for Record {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "{} (len {}) aligned to [{}]:{}-{}",
std::str::from_utf8(&self.name).unwrap_or("*NAME NOT UTF-8*"), self.query_len(),
self.ref_id(), self.start() + 1, self.calculate_end())
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EntriesError {
NoSequence,
NoMD,
IncorrectMD,
}
const MISSING: u32 = std::u32::MAX;
#[derive(Clone)]
pub struct AlignmentEntry {
record_pos: u32,
record_nt: u8,
ref_pos: u32,
ref_nt: u8,
}
impl AlignmentEntry {
pub fn record_pos(&self) -> Option<u32> {
if self.record_pos != MISSING {
Some(self.record_pos)
} else {
None
}
}
pub fn record_nt(&self) -> Option<u8> {
if self.record_pos != MISSING {
Some(self.record_nt)
} else {
None
}
}
pub fn record_pos_nt(&self) -> Option<(u32, u8)> {
if self.record_pos != MISSING {
Some((self.record_pos, self.record_nt))
} else {
None
}
}
pub fn ref_pos(&self) -> Option<u32> {
if self.ref_pos != MISSING {
Some(self.ref_pos)
} else {
None
}
}
pub fn ref_nt(&self) -> Option<u8> {
if self.ref_pos != MISSING {
Some(self.ref_nt)
} else {
None
}
}
pub fn ref_pos_nt(&self) -> Option<(u32, u8)> {
if self.ref_pos != MISSING {
Some((self.ref_pos, self.ref_nt))
} else {
None
}
}
pub fn is_insertion(&self) -> bool {
self.ref_pos == MISSING
}
pub fn is_deletion(&self) -> bool {
self.record_pos == MISSING
}
pub fn is_aln_match(&self) -> bool {
self.record_pos != MISSING && self.ref_pos != MISSING
}
pub fn is_seq_match(&self) -> bool {
self.record_nt == self.ref_nt
}
}
impl fmt::Debug for AlignmentEntry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.record_pos != MISSING {
write!(f, "{}: {}, ", self.record_pos, self.record_nt as char)?;
} else {
write!(f, "- , ")?;
}
if self.ref_pos != MISSING {
write!(f, "{}: {}", self.ref_pos, self.ref_nt as char)
} else {
write!(f, " -")
}
}
}
#[derive(Clone)]
pub struct EntriesIter<'a> {
parent: &'a Record,
aligned_pairs: cigar::AlignedPairs<'a>,
md_tag: &'a [u8],
md_remaining_len: u32,
md_index: usize,
}
impl<'a> EntriesIter<'a> {
fn curr_ref_nt(&mut self, record_nt: Option<u8>) -> u8 {
let start_with_remaining_md = self.md_remaining_len > 0;
while !start_with_remaining_md && self.md_index < self.md_tag.len() {
let curr_char = self.md_tag[self.md_index];
self.md_index += 1;
if curr_char >= b'0' && curr_char <= b'9' {
self.md_remaining_len = self.md_remaining_len * 10 + (curr_char - b'0') as u32;
continue;
}
if self.md_remaining_len > 0 {
self.md_index -= 1;
break;
}
if curr_char == b'^' {
assert!(record_nt.is_none(),
"Record {}: Failed to parse MD tag: {}. MD tag indicates deletion, but CIGAR does not",
from_utf8(self.parent.name()).unwrap_or("*NAME NON UTF-8*"),
from_utf8(self.md_tag).unwrap_or("*MD TAG NON UTF-8*"));
} else {
assert!(curr_char.is_ascii_uppercase(),
"Record {}: Failed to parse MD tag: {}. Unexpected MD char: {}",
from_utf8(self.parent.name()).unwrap_or("*NAME NON UTF-8*"),
from_utf8(self.md_tag).unwrap_or("*MD TAG NON UTF-8*"), curr_char as char);
return curr_char;
}
}
if self.md_remaining_len > 0 {
self.md_remaining_len -= 1;
return record_nt.unwrap_or_else(|| panic!(
"Record {}: Failed to parse MD tag: {}. Reference should match record within deletion",
from_utf8(self.parent.name()).unwrap_or("*NAME NON UTF-8*"),
from_utf8(self.md_tag).unwrap_or("*MD TAG NON UTF-8*")));
}
panic!("Record {}: Failed to parse MD tag: {}. Reached the end of MD tag",
from_utf8(self.parent.name()).unwrap_or("*NAME NON UTF-8*"),
from_utf8(self.md_tag).unwrap_or("*MD TAG NON UTF-8*"));
}
}
impl<'a> Iterator for EntriesIter<'a> {
type Item = AlignmentEntry;
fn next(&mut self) -> Option<Self::Item> {
let pair = self.aligned_pairs.next()?;
match pair {
(Some(record_pos), Some(ref_pos)) => {
let record_nt = self.parent.sequence().at(record_pos as usize);
let ref_nt = self.curr_ref_nt(Some(record_nt));
return Some(AlignmentEntry { record_pos, record_nt, ref_pos, ref_nt });
},
(Some(record_pos), None) => {
let record_nt = self.parent.sequence().at(record_pos as usize);
return Some(AlignmentEntry {
record_pos, record_nt,
ref_pos: MISSING,
ref_nt: 0,
});
},
(None, Some(ref_pos)) => {
let ref_nt = self.curr_ref_nt(None);
return Some(AlignmentEntry {
ref_pos, ref_nt,
record_pos: MISSING,
record_nt: 0,
});
},
(None, None) => unreachable!(),
}
}
}