use std::io::{self, Read, ErrorKind, Write};
use std::io::ErrorKind::{InvalidData, UnexpectedEof};
use std::fmt::{self, Display, Debug, Formatter};
use std::cell::Cell;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
pub mod tags;
pub mod cigar;
pub mod sequence;
use super::header::Header;
use super::index;
pub use cigar::Cigar;
pub use sequence::Sequence;
pub use sequence::Qualities;
pub const RECORD_PAIRED: u16 = 0x1;
pub const ALL_SEGMENTS_ALIGNED: u16 = 0x2;
pub const RECORD_UNMAPPED: u16 = 0x4;
pub const MATE_UNMAPPED: u16 = 0x8;
pub const RECORD_REVERSE_STRAND: u16 = 0x10;
pub const MATE_REVERSE_STRAND: u16 = 0x20;
pub const FIRST_IN_PAIR: u16 = 0x40;
pub const LAST_IN_PAIR: u16 = 0x80;
pub const SECONDARY: u16 = 0x100;
pub const RECORD_FAILS_QC: u16 = 0x200;
pub const PCR_OR_OPTICAL_DUPLICATE: u16 = 0x400;
pub const SUPPLEMENTARY: u16 = 0x800;
#[derive(Clone, Copy, Debug)]
pub struct Flag(pub u16);
impl Flag {
pub fn is_paired(&self) -> bool {
self.0 & RECORD_PAIRED != 0
}
pub fn all_segments_aligned(&self) -> bool {
self.0 & ALL_SEGMENTS_ALIGNED != 0
}
pub fn is_mapped(&self) -> bool {
self.0 & RECORD_UNMAPPED == 0
}
pub fn mate_is_mapped(&self) -> bool {
self.0 & MATE_UNMAPPED == 0
}
pub fn is_reverse_strand(&self) -> bool {
self.0 & RECORD_REVERSE_STRAND != 0
}
pub fn mate_is_reverse_strand(&self) -> bool {
self.0 & MATE_REVERSE_STRAND != 0
}
pub fn first_in_pair(&self) -> bool {
self.0 & FIRST_IN_PAIR != 0
}
pub fn last_in_pair(&self) -> bool {
self.0 & LAST_IN_PAIR != 0
}
pub fn is_secondary(&self) -> bool {
self.0 & SECONDARY != 0
}
pub fn fails_quality_controls(&self) -> bool {
self.0 & RECORD_FAILS_QC != 0
}
pub fn is_duplicate(&self) -> bool {
self.0 & PCR_OR_OPTICAL_DUPLICATE != 0
}
pub fn is_supplementary(&self) -> bool {
self.0 & SUPPLEMENTARY != 0
}
pub fn set_paired(&mut self, paired: bool) {
if paired {
self.0 |= RECORD_PAIRED;
} else {
self.0 &= !RECORD_PAIRED;
}
}
pub fn set_all_segments_aligned(&mut self, all_segments_aligned: bool) {
if all_segments_aligned {
self.0 |= ALL_SEGMENTS_ALIGNED;
} else {
self.0 &= !ALL_SEGMENTS_ALIGNED;
}
}
pub fn set_mapped(&mut self, mapped: bool) {
if mapped {
self.0 &= !RECORD_UNMAPPED;
} else {
self.0 |= RECORD_UNMAPPED;
}
}
pub fn set_mate_mapped(&mut self, mate_mapped: bool) {
if mate_mapped {
self.0 &= !MATE_UNMAPPED;
} else {
self.0 |= MATE_UNMAPPED;
}
}
pub fn set_strand(&mut self, forward_strand: bool) {
if forward_strand {
self.0 &= !RECORD_REVERSE_STRAND;
} else {
self.0 |= RECORD_REVERSE_STRAND;
}
}
pub fn set_mate_strand(&mut self, forward_strand: bool) {
if forward_strand {
self.0 &= !MATE_REVERSE_STRAND;
} else {
self.0 |= MATE_REVERSE_STRAND;
}
}
pub fn set_first_in_pair(&mut self, is_first: bool) {
if is_first {
self.0 |= FIRST_IN_PAIR;
} else {
self.0 &= !FIRST_IN_PAIR;
}
}
pub fn set_last_in_pair(&mut self, is_last: bool) {
if is_last {
self.0 |= LAST_IN_PAIR;
} else {
self.0 &= !LAST_IN_PAIR;
}
}
pub fn set_secondary(&mut self, is_secondary: bool) {
if is_secondary {
self.0 |= SECONDARY;
} else {
self.0 &= !SECONDARY;
}
}
pub fn make_fail_quality_controls(&mut self, fails: bool) {
if fails {
self.0 |= RECORD_FAILS_QC;
} else {
self.0 &= !RECORD_FAILS_QC;
}
}
pub fn set_duplicate(&mut self, is_duplicate: bool) {
if is_duplicate {
self.0 |= PCR_OR_OPTICAL_DUPLICATE;
} else {
self.0 &= !PCR_OR_OPTICAL_DUPLICATE;
}
}
pub fn set_supplementary(&mut self, is_supplementary: bool) {
if is_supplementary {
self.0 |= SUPPLEMENTARY;
} else {
self.0 &= !SUPPLEMENTARY;
}
}
}
pub enum Error {
NoMoreRecords,
Corrupted(String),
Truncated(io::Error),
}
impl From<io::Error> for Error {
fn from(e: io::Error) -> Error {
Error::Truncated(e)
}
}
impl From<&str> for Error {
fn from(e: &str) -> Error {
Error::Corrupted(e.to_string())
}
}
impl Debug for Error {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
match self {
Error::NoMoreRecords => write!(f, "No more records"),
Error::Corrupted(e) => write!(f, "Corrupted record: {}", e),
Error::Truncated(e) => write!(f, "Truncated record: {}", e),
}
}
}
impl Display for Error {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
match self {
Error::NoMoreRecords => write!(f, "No more records"),
Error::Corrupted(e) => write!(f, "Corrupted record: {}", e),
Error::Truncated(e) => write!(f, "Truncated record: {}", e),
}
}
}
pub(crate) unsafe fn resize<T>(v: &mut Vec<T>, new_len: usize) {
if v.capacity() < new_len {
v.reserve(new_len - v.len());
}
v.set_len(new_len);
}
pub(crate) fn write_iterator<W, I>(writer: &mut W, mut iterator: I) -> io::Result<()>
where W: Write,
I: Iterator<Item = u8>,
{
const SIZE: usize = 1024;
let mut buffer = [0_u8; SIZE];
loop {
for i in 0..SIZE {
match iterator.next() {
Some(value) => buffer[i] = value,
None => {
return writer.write_all(&buffer[..i]);
}
}
}
writer.write_all(&buffer)?;
}
}
trait NextToErr<'a> {
fn try_next(&mut self, field: &'static str) -> Result<&'a str, Error>;
}
impl<'a> NextToErr<'a> for std::str::Split<'a, char> {
fn try_next(&mut self, field: &'static str) -> Result<&'a str, Error> {
self.next().ok_or_else(|| Error::Truncated(io::Error::new(UnexpectedEof,
format!("Cannot extract {}", field))))
}
}
pub struct Record {
ref_id: i32,
mate_ref_id: i32,
start: i32,
end: Cell<i32>,
mate_start: i32,
bin: Cell<u16>,
mapq: u8,
flag: Flag,
template_len: i32,
name: Vec<u8>,
cigar: Cigar,
seq: Sequence,
qual: Qualities,
tags: tags::TagViewer,
}
const BIN_UNKNOWN: u16 = std::u16::MAX;
impl Record {
pub fn new() -> Record {
Record {
ref_id: -1,
mate_ref_id: -1,
start: -1,
end: Cell::new(0),
mate_start: -1,
bin: Cell::new(BIN_UNKNOWN),
mapq: 0,
flag: Flag(0),
template_len: 0,
name: Vec::new(),
cigar: Cigar::new(),
seq: Sequence::new(),
qual: Qualities::new(),
tags: tags::TagViewer::new(),
}
}
pub fn clear(&mut self) {
self.ref_id = -1;
self.mate_ref_id = -1;
self.start = -1;
self.end.set(0);
self.mate_start = -1;
self.bin.set(BIN_UNKNOWN);
self.mapq = 0;
self.flag.0 = 0;
self.template_len = 0;
self.name.clear();
self.cigar.clear();
self.seq.clear();
self.qual.clear();
self.tags.clear();
}
fn corrupt(&mut self, text: String) -> Error {
if self.name.is_empty() {
Error::Corrupted(format!("Record {}: {}",
std::str::from_utf8(&self.name).unwrap_or("_"), text))
} else {
Error::Corrupted(text)
}
}
pub(crate) fn fill_from_bam<R: Read>(&mut self, stream: &mut R) -> Result<(), Error> {
self.name.clear();
let block_size = match stream.read_i32::<LittleEndian>() {
Ok(value) => {
if value < 0 {
return Err(self.corrupt("Negative block size".to_string()));
}
value as usize
},
Err(e) => {
return Err(if e.kind() == ErrorKind::UnexpectedEof {
Error::NoMoreRecords
} else {
Error::from(e)
})
},
};
self.set_ref_id(stream.read_i32::<LittleEndian>()?)?;
self.end.set(0);
self.set_start(stream.read_i32::<LittleEndian>()?)?;
let name_len = stream.read_u8()?;
if name_len == 0 {
return Err(self.corrupt("Name length == 0".to_string()));
}
self.set_mapq(stream.read_u8()?);
self.bin.set(stream.read_u16::<LittleEndian>()?);
let cigar_len = stream.read_u16::<LittleEndian>()?;
self.set_flag(stream.read_u16::<LittleEndian>()?);
let qual_len = stream.read_i32::<LittleEndian>()?;
if qual_len < 0 {
return Err(self.corrupt("Negative sequence length".to_string()));
}
let qual_len = qual_len as usize;
self.set_mate_ref_id(stream.read_i32::<LittleEndian>()?)?;
self.set_mate_start(stream.read_i32::<LittleEndian>()?)?;
self.set_template_len(stream.read_i32::<LittleEndian>()?);
unsafe {
resize(&mut self.name, name_len as usize - 1);
}
stream.read_exact(&mut self.name)?;
let _null_symbol = stream.read_u8()?;
self.cigar.fill_from(stream, cigar_len as usize)?;
self.seq.fill_from(stream, qual_len)?;
self.qual.fill_from(stream, qual_len)?;
let seq_len = (qual_len + 1) / 2;
let remaining_size = block_size - 32 - name_len as usize - 4 * cigar_len as usize
- seq_len - qual_len;
self.tags.fill_from(stream, remaining_size)?;
self.replace_cigar_if_needed()?;
if self.flag().is_mapped() == (self.ref_id == -1) {
return Err(self.corrupt("Record (flag & 0x4) and ref_id do not match".to_string()));
}
Ok(())
}
pub fn fill_from_sam(&mut self, line: &str, header: &Header) -> Result<(), Error> {
let mut split = line.split('\t');
self.set_name(split.try_next("record name (QNAME)")?.bytes());
let flag = split.try_next("flag")?;
let flag = flag.parse()
.map_err(|_| self.corrupt(format!("Cannot convert flag '{}' to int", flag)))?;
self.set_flag(flag);
let rname = split.try_next("reference name")?;
if rname == "*" {
self.set_ref_id(-1).unwrap();
} else {
let r_id = header.reference_id(rname).ok_or_else(||
self.corrupt(format!("Reference '{}' is not in the header", rname)))?;
self.set_ref_id(r_id as i32)?;
}
let start = split.try_next("start (POS)")?;
let start = start.parse::<i32>()
.map_err(|_| self.corrupt(format!("Cannot convert POS '{}' to int", start)))? - 1;
self.set_start(start)?;
let mapq = split.try_next("mapq")?;
let mapq = mapq.parse()
.map_err(|_| self.corrupt(format!("Cannot convert MAPQ '{}' to int", mapq)))?;
self.set_mapq(mapq);
self.set_cigar(split.try_next("CIGAR")?.bytes()).map_err(|e| self.corrupt(e))?;
let rnext = split.try_next("mate reference name (RNEXT)")?;
if rnext == "*" {
self.set_mate_ref_id(-1).unwrap();
} else if rnext == "=" {
self.set_mate_ref_id(self.ref_id).unwrap();
} else {
let nr_id = header.reference_id(rnext).ok_or_else(||
self.corrupt(format!("Reference '{}' is not in the header", rnext)))?;
self.set_mate_ref_id(nr_id as i32)?;
}
let mate_start = split.try_next("mate start (PNEXT)")?;
let mate_start = mate_start.parse::<i32>().map_err(|_|
self.corrupt(format!("Cannot convert PNEXT '{}' to int", mate_start)))? - 1;
self.set_mate_start(mate_start)?;
let template_len = split.try_next("template length (TLEN)")?;
let template_len = template_len.parse::<i32>().map_err(|_|
self.corrupt(format!("Cannot convert TLEN '{}' to int", template_len)))?;
self.set_template_len(template_len);
let seq = split.try_next("sequence")?;
let qual = split.try_next("qualities")?;
if seq == "*" {
self.reset_seq();
} else if qual == "*" {
self.set_seq(seq.bytes()).map_err(|e| self.corrupt(e))?;
} else {
self.set_seq_qual(seq.bytes(), qual.bytes().map(|q| q - 33))
.map_err(|e| self.corrupt(e))?;
}
self.tags.clear();
for tag in split {
self.tags.push_sam(tag)?;
}
Ok(())
}
fn replace_cigar_if_needed(&mut self) -> Result<(), Error> {
if self.cigar.len() > 0 && self.cigar.at(0) ==
(self.seq.len() as u32, cigar::Operation::Soft) {
if self.cigar.len() != 2 {
return Err(self.corrupt("Record contains invalid Cigar".to_string()));
}
let (len, op) = self.cigar.at(1);
if op != cigar::Operation::Skip {
return Err(self.corrupt("Record contains invalid Cigar".to_string()));
}
self.end.set(self.start + len as i32);
let cigar_arr = match self.tags.get(b"CG") {
Some(tags::TagValue::IntArray(array_view)) => {
if array_view.int_type() != tags::IntegerType::U32 {
return Err(self.corrupt("CG tag has an incorrect type".to_string()));
}
array_view
},
_ => return Err(
self.corrupt("Record should contain tag CG, but does not".to_string())),
};
self.cigar.clear();
self.cigar.extend_from_raw(cigar_arr.iter().map(|el| el as u32));
std::mem::drop(cigar_arr);
self.tags.remove(b"CG");
}
Ok(())
}
pub fn shrink_to_fit(&mut self) {
self.name.shrink_to_fit();
self.cigar.shrink_to_fit();
self.seq.shrink_to_fit();
self.qual.shrink_to_fit();
self.tags.shrink_to_fit();
}
pub fn name(&self) -> &[u8] {
&self.name
}
pub fn sequence(&self) -> &Sequence {
&self.seq
}
pub fn qualities(&self) -> Option<&Qualities> {
if self.qual.unavailable() {
None
} else {
Some(&self.qual)
}
}
pub fn cigar(&self) -> &Cigar {
&self.cigar
}
pub fn ref_id(&self) -> i32 {
self.ref_id
}
pub fn start(&self) -> i32 {
self.start
}
pub fn calculate_end(&self) -> i32 {
if self.cigar.len() == 0 {
return 0;
}
let end = self.end.get();
if end != 0 {
return end;
}
let end = self.start + self.cigar.calculate_ref_len() as i32;
self.end.set(end);
end
}
pub fn calculate_bin(&self) -> u16 {
let bin = self.bin.get();
if bin != BIN_UNKNOWN {
return bin;
}
let end = self.calculate_end();
let bin = index::region_to_bin(self.start, end) as u16;
self.bin.set(bin);
bin
}
pub fn mapq(&self) -> u8 {
self.mapq
}
pub fn mate_ref_id(&self) -> i32 {
self.mate_ref_id
}
pub fn mate_start(&self) -> i32 {
self.mate_start
}
pub fn template_len(&self) -> i32 {
self.template_len
}
pub fn tags(&self) -> &tags::TagViewer {
&self.tags
}
pub fn tags_mut(&mut self) -> &mut tags::TagViewer {
&mut self.tags
}
pub fn write_sam<W: Write>(&self, f: &mut W, header: &Header) -> io::Result<()> {
f.write_all(&self.name)?;
write!(f, "\t{}\t", self.flag.0)?;
if self.ref_id < 0 {
f.write_all(b"*\t")?;
} else {
write!(f, "{}\t", header.reference_name(self.ref_id as usize)
.ok_or_else(|| io::Error::new(InvalidData,
"Record has a reference id not in the header"))?)?;
}
write!(f, "{}\t{}\t", self.start + 1, self.mapq)?;
self.cigar.write_readable(f)?;
if self.mate_ref_id < 0 {
f.write_all(b"\t*\t")?;
} else if self.mate_ref_id == self.ref_id {
f.write_all(b"\t=\t")?;
} else {
write!(f, "\t{}\t", header.reference_name(self.mate_ref_id as usize)
.ok_or_else(|| io::Error::new(InvalidData,
"Record has a reference id not in the header"))?)?;
}
write!(f, "{}\t{}\t", self.mate_start + 1, self.template_len)?;
self.seq.write_readable(f)?;
f.write_u8(b'\t')?;
self.qual.write_readable(f)?;
self.tags.write_sam(f)?;
writeln!(f)
}
pub fn write_bam<W: Write>(&self, stream: &mut W) -> io::Result<()> {
let raw_cigar_len = if self.cigar.len() <= 0xffff {
4 * self.cigar.len()
} else {
16 + 4 * self.cigar.len()
};
let total_block_len = 32 + self.name.len() + 1 + raw_cigar_len + self.seq.raw().len()
+ self.qual.len() + self.tags.raw().len();
stream.write_i32::<LittleEndian>(total_block_len as i32)?;
stream.write_i32::<LittleEndian>(self.ref_id)?;
stream.write_i32::<LittleEndian>(self.start)?;
stream.write_u8(self.name.len() as u8 + 1)?;
stream.write_u8(self.mapq)?;
stream.write_u16::<LittleEndian>(self.calculate_bin())?;
if self.cigar.len() <= 0xffff {
stream.write_u16::<LittleEndian>(self.cigar.len() as u16)?;
} else {
stream.write_u16::<LittleEndian>(2)?;
}
stream.write_u16::<LittleEndian>(self.flag.0)?;
stream.write_i32::<LittleEndian>(self.seq.len() as i32)?;
stream.write_i32::<LittleEndian>(self.mate_ref_id)?;
stream.write_i32::<LittleEndian>(self.mate_start)?;
stream.write_i32::<LittleEndian>(self.template_len)?;
stream.write_all(&self.name)?;
stream.write_u8(0)?;
if self.cigar.len() <= 0xffff {
for &el in self.cigar.raw() {
stream.write_u32::<LittleEndian>(el)?;
}
} else {
let seq_len = if self.seq.len() != 0 {
self.seq.len() as u32
} else {
self.cigar.calculate_query_len()
};
stream.write_u32::<LittleEndian>(seq_len << 4 | 4)?;
let ref_len = (self.calculate_end() - self.start) as u32;
stream.write_u32::<LittleEndian>(ref_len << 4 | 3)?;
}
stream.write_all(&self.seq.raw())?;
stream.write_all(&self.qual.raw())?;
stream.write_all(self.tags.raw())?;
if self.cigar.len() > 0xffff {
stream.write_all(b"CGBI")?;
stream.write_i32::<LittleEndian>(self.cigar.len() as i32)?;
for &el in self.cigar.raw() {
stream.write_u32::<LittleEndian>(el)?;
}
}
Ok(())
}
pub fn set_name<T: IntoIterator<Item = u8>>(&mut self, name: T) {
self.name.clear();
self.name.extend(name.into_iter().take(254));
}
pub fn flag(&self) -> Flag {
self.flag
}
pub fn flag_mut(&mut self) -> &mut Flag {
&mut self.flag
}
pub fn set_flag(&mut self, flag: u16) {
self.flag.0 = flag;
}
pub fn set_ref_id(&mut self, ref_id: i32) -> Result<(), &'static str> {
if ref_id < -1 {
Err("Reference id < -1")
} else {
self.ref_id = ref_id;
Ok(())
}
}
pub fn set_start(&mut self, start: i32) -> Result<(), &'static str> {
if start < -1 {
Err("Start < -1")
} else {
let difference = start - self.start;
self.start = start;
if self.end.get() != 0 {
*self.end.get_mut() += difference;
}
self.bin.set(BIN_UNKNOWN);
Ok(())
}
}
pub fn set_mapq(&mut self, mapq: u8) {
self.mapq = mapq;
}
pub fn set_mate_ref_id(&mut self, mate_ref_id: i32) -> Result<(), &'static str> {
if mate_ref_id < -1 {
Err("Mate reference id < -1")
} else {
self.mate_ref_id = mate_ref_id;
Ok(())
}
}
pub fn set_mate_start(&mut self, mate_start: i32) -> Result<(), &'static str> {
if mate_start < -1 {
Err("Mate start < -1")
} else {
self.mate_start = mate_start;
Ok(())
}
}
pub fn set_template_len(&mut self, template_len: i32) {
self.template_len = template_len;
}
pub fn reset_seq(&mut self) {
self.seq.clear();
self.qual.clear();
}
pub fn set_seq<T: IntoIterator<Item = u8>>(&mut self, sequence: T) -> Result<(), String> {
self.seq.clear();
if let Err(e) = self.seq.extend_from_text(sequence) {
self.seq.clear();
self.qual.clear();
return Err(e);
}
self.qual.clear();
self.qual.extend_from_raw(std::iter::repeat(0xff_u8).take(self.seq.len()));
Ok(())
}
pub fn set_seq_qual<T, U>(&mut self, sequence: T, qualities: U) -> Result<(), String>
where T: IntoIterator<Item = u8>,
U: IntoIterator<Item = u8>,
{
self.seq.clear();
if let Err(e) = self.seq.extend_from_text(sequence) {
self.seq.clear();
self.qual.clear();
return Err(e);
}
self.qual.clear();
self.qual.extend_from_raw(qualities);
if self.seq.len() != self.qual.len() {
let err = Err(format!("Trying to set sequence and qualities of different lengths: \
{} and {}", self.seq.len(), self.qual.len()));
self.seq.clear();
self.qual.clear();
err
} else {
Ok(())
}
}
pub fn set_raw_seq(&mut self, raw_seq: &[u8], len: usize) {
assert!(len + 1 / 2 == raw_seq.len(), "Sequence length can be {} or {}, but not {}",
raw_seq.len() * 2 - 1, raw_seq.len() * 2, len);
let mut slice = &raw_seq[..];
self.seq.fill_from(&mut slice, len).unwrap();
}
pub fn set_raw_seq_qual<U>(&mut self, raw_seq: &[u8], qualities: U) -> Result<(), String>
where U: IntoIterator<Item = u8>,
{
self.qual.clear();
self.qual.extend_from_raw(qualities);
let len = self.qual.len();
if len + 1 / 2 == raw_seq.len() {
self.seq.clear();
self.qual.clear();
return Err(format!("Sequence length can be {} or {}, but not {}",
raw_seq.len() * 2 - 1, raw_seq.len() * 2, len));
}
let mut slice = &raw_seq[..];
self.seq.fill_from(&mut slice, len).unwrap();
Ok(())
}
pub fn set_cigar<I: IntoIterator<Item = u8>>(&mut self, cigar: I) -> Result<(), String> {
self.end.set(0);
self.bin.set(BIN_UNKNOWN);
self.cigar.clear();
self.cigar.extend_from_text(cigar)
}
pub fn set_raw_cigar<I: IntoIterator<Item = u32>>(&mut self, cigar: I) {
self.end.set(0);
self.bin.set(BIN_UNKNOWN);
self.cigar.clear();
self.cigar.extend_from_raw(cigar);
}
pub fn aligned_pairs(&self) -> cigar::AlignedPairs {
self.cigar.aligned_pairs(self.start as u32)
}
pub fn matching_pairs(&self) -> cigar::MatchingPairs {
self.cigar.matching_pairs(self.start as u32)
}
}