#![warn(missing_docs)]
pub mod bam_order;
pub mod sequence_dict;
pub mod wrappers;
use bam_order::BamSortOrder;
use derive_builder::*;
use rand::prelude::*;
use rust_htslib::bam::{
header::{Header, HeaderRecord},
record::{CigarString, Record},
HeaderView,
};
use rust_htslib::{
bam::{self, record::Aux},
errors::Error,
};
use sequence_dict::SequenceDict;
use std::collections::HashMap;
use std::rc::Rc;
use std::{convert::TryFrom, path::Path};
use tempfile::NamedTempFile;
use wrappers::{AuxType, ReadGroupId, SampleName, Strand};
const DEFAULT_SEED: usize = 42;
const DEFAULT_BASES: [char; 4] = ['A', 'C', 'T', 'G'];
#[derive(Debug)]
pub struct BamBuilder {
pub read_length: usize,
pub base_quality: u8,
pub sample_name: SampleName,
pub read_group_id: ReadGroupId,
pub sort_order: BamSortOrder,
pub header: Header,
pub rng: StdRng,
pub records: Vec<Record>,
pub default_quals: String,
pub counter: usize,
}
impl BamBuilder {
pub fn new(
read_length: usize,
base_quality: u8,
sample_name: String,
read_group_id: Option<String>,
sort: BamSortOrder,
pseudo_sd: Option<SequenceDict>,
seed: Option<usize>,
) -> BamBuilder {
let pseudo_sd = pseudo_sd.unwrap_or(SequenceDict::default());
let seed = seed.unwrap_or(DEFAULT_SEED);
let rng = StdRng::seed_from_u64(seed as u64);
let read_group_id = match read_group_id {
Some(rgi) => ReadGroupId(rgi),
None => ReadGroupId::default(),
};
let default_quals = std::iter::repeat(char::from(33 + base_quality))
.take(read_length)
.collect::<String>();
let mut header: Header = pseudo_sd.into();
let mut header_record = HeaderRecord::new("RG".as_bytes());
header_record.push_tag("ID".as_bytes(), &read_group_id.0);
header.push_record(&header_record);
BamBuilder {
read_length,
base_quality,
sample_name: SampleName(sample_name),
read_group_id,
sort_order: sort,
header,
rng,
records: vec![],
default_quals,
counter: 0,
}
}
fn next_name(&mut self) -> String {
let name = format!("{:0>16}", self.counter);
self.counter += 1;
name
}
fn random_bases(&mut self) -> String {
(0..self.read_length)
.map(|_| DEFAULT_BASES[self.rng.gen_range(0, DEFAULT_BASES.len())])
.collect::<String>()
}
pub fn frag_builder(&mut self) -> ReadFragSpecBuilder {
ReadFragSpecBuilder::default()
.name(self.next_name())
.bases(self.random_bases())
.quals(self.default_quals.clone())
.contig(-1)
.start(-1)
.unmapped(true)
.cigar(format!("{}M", &self.read_length))
.mapq(60)
.strand(Strand::Plus)
.attrs(HashMap::new())
.to_owned()
}
pub fn add_frag(&mut self, frag_spec: ReadFragSpec) {
assert!(
frag_spec.bases == ""
|| frag_spec.quals == ""
|| frag_spec.bases.len() == frag_spec.quals.len(),
"bases and quals were different lengths."
);
let cigar = CigarString::try_from(frag_spec.cigar.as_str()).expect("Malformed cigar");
assert!(
frag_spec.unmapped
|| frag_spec.bases == ""
|| frag_spec.bases.len() == cigar.clone().into_view(0).end_pos() as usize,
"bases doesn't agree with cigar on length"
);
assert!(
frag_spec.unmapped
|| frag_spec.quals == ""
|| frag_spec.quals.len() == cigar.clone().into_view(0).end_pos() as usize,
"quals doesn't agree with cigar on length"
);
let mut r = Record::new();
let cigar = CigarString::try_from(frag_spec.cigar.as_str()).unwrap();
r.set(
frag_spec.name.into_bytes().as_ref(),
if !frag_spec.unmapped {
Some(&cigar)
} else {
None
},
frag_spec.bases.into_bytes().as_ref(),
frag_spec.quals.into_bytes().as_ref(),
);
r.set_header(Rc::new(HeaderView::from_header(&self.header)));
r.set_tid(frag_spec.contig);
r.set_pos(frag_spec.start);
if !frag_spec.unmapped {
r.set_mapq(frag_spec.mapq)
}
match frag_spec.strand {
Strand::Plus => (),
Strand::Minus => r.set_reverse(),
}
if frag_spec.unmapped {
r.set_unmapped();
}
r.push_aux(
"RG".as_bytes(),
&Aux::String(&self.read_group_id.0.as_bytes()),
);
for (key, value) in frag_spec.attrs.iter() {
r.push_aux(key.as_bytes(), &value.into());
}
self.records.push(r);
}
pub fn pair_builder(&mut self) -> ReadPairSpecBuilder {
ReadPairSpecBuilder::default()
.name(self.next_name())
.bases1(self.random_bases())
.bases2(self.random_bases())
.quals1(self.default_quals.clone())
.quals2(self.default_quals.clone())
.contig(-1)
.start1(-1)
.start2(-1)
.unmapped1(true)
.unmapped2(true)
.cigar1(format!("{}M", &self.read_length))
.cigar2(format!("{}M", &self.read_length))
.mapq1(60)
.mapq2(60)
.strand1(Strand::Plus)
.strand2(Strand::Minus)
.attrs(HashMap::new())
.to_owned()
}
pub fn add_pair(&mut self, pair_spec: ReadPairSpec) {
assert!(
pair_spec.bases1 == ""
|| pair_spec.quals1 == ""
|| pair_spec.bases1.len() == pair_spec.quals1.len(),
"bases1 and quals1 were different lengths."
);
assert!(
pair_spec.bases2 == ""
|| pair_spec.quals2 == ""
|| pair_spec.bases2.len() == pair_spec.quals2.len(),
"bases2 and quals2 were different lengths."
);
let cigar1 = CigarString::try_from(pair_spec.cigar1.as_str()).expect("Malformed cigar1");
let cigar2 = CigarString::try_from(pair_spec.cigar2.as_str()).expect("Malformed cigar2");
assert!(
pair_spec.unmapped1
|| pair_spec.bases1 == ""
|| pair_spec.bases1.len() == cigar1.clone().into_view(0).end_pos() as usize,
"bases1 doesn't agree with cigar on length"
);
assert!(
pair_spec.unmapped1
|| pair_spec.bases2 == ""
|| pair_spec.bases2.len() == cigar2.clone().into_view(0).end_pos() as usize,
"bases2 doesn't agree with cigar on length"
);
assert!(
pair_spec.unmapped1
|| pair_spec.quals1 == ""
|| pair_spec.quals1.len() == cigar1.clone().into_view(0).end_pos() as usize,
"quals1 doesn't agree with cigar on length"
);
assert!(
pair_spec.unmapped2
|| pair_spec.quals2 == ""
|| pair_spec.quals2.len() == cigar2.clone().into_view(0).end_pos() as usize,
"quals2 doesn't agree with cigar on length"
);
let mut r1 = Record::new();
let cigar = CigarString::try_from(pair_spec.cigar1.as_str()).unwrap();
r1.set(
pair_spec.name.into_bytes().as_ref(),
if !pair_spec.unmapped1 {
Some(&cigar)
} else {
None
},
pair_spec.bases1.into_bytes().as_ref(),
pair_spec.quals1.into_bytes().as_ref(),
);
r1.set_header(Rc::new(HeaderView::from_header(&self.header)));
r1.set_tid(pair_spec.contig);
r1.set_pos(pair_spec.start1);
if !pair_spec.unmapped1 {
r1.set_mapq(pair_spec.mapq1)
}
match pair_spec.strand1 {
Strand::Plus => (),
Strand::Minus => r1.set_reverse(),
}
r1.set_paired();
r1.set_first_in_template();
if pair_spec.unmapped1 {
r1.set_unmapped();
}
let mut r2 = Record::new();
let cigar = CigarString::try_from(pair_spec.cigar2.as_str()).unwrap();
r2.set(
r1.qname(),
if !pair_spec.unmapped2 {
Some(&cigar)
} else {
None
},
pair_spec.bases2.into_bytes().as_ref(),
pair_spec.quals2.into_bytes().as_ref(),
);
r2.set_header(Rc::new(HeaderView::from_header(&self.header)));
r2.set_tid(pair_spec.contig);
r2.set_pos(pair_spec.start2);
if !pair_spec.unmapped2 {
r2.set_mapq(pair_spec.mapq2)
}
match pair_spec.strand2 {
Strand::Plus => (),
Strand::Minus => r2.set_reverse(),
}
r2.set_paired();
r2.set_last_in_template();
if pair_spec.unmapped2 {
r2.set_unmapped();
}
r1.push_aux(
"RG".as_bytes(),
&Aux::String(&self.read_group_id.0.as_bytes()),
);
r2.push_aux(
"RG".as_bytes(),
&Aux::String(&self.read_group_id.0.as_bytes()),
);
for (key, value) in pair_spec.attrs.iter() {
r1.push_aux(key.as_bytes(), &value.into());
r2.push_aux(key.as_bytes(), &value.into());
}
BamBuilder::set_mate_info(&mut r1, &mut r2, true);
self.records.push(r1);
self.records.push(r2);
}
fn set_mate_info(rec1: &mut Record, rec2: &mut Record, set_mate_cigar: bool) {
if !rec1.is_unmapped() && !rec2.is_unmapped() {
rec1.set_mtid(rec2.tid());
rec1.set_mpos(rec2.pos());
if rec2.is_reverse() {
rec1.set_mate_reverse()
}
rec1.push_aux(b"MQ", &Aux::Char(rec2.mapq()));
rec2.set_mtid(rec1.tid());
rec2.set_mpos(rec1.pos());
if rec1.is_reverse() {
rec2.set_mate_reverse()
}
rec2.push_aux(b"MQ", &Aux::Char(rec1.mapq()));
if set_mate_cigar {
rec1.push_aux(b"MC", &Aux::String(rec2.cigar().to_string().as_bytes()));
rec2.push_aux(b"MC", &Aux::String(rec1.cigar().to_string().as_bytes()));
}
} else if rec1.is_unmapped() && rec2.is_unmapped() {
rec1.set_tid(-1);
rec1.set_pos(-1);
rec1.set_mtid(-1);
rec1.set_mpos(-1);
rec1.set_mate_unmapped();
rec1.set_insert_size(0);
if rec2.is_reverse() {
rec1.set_mate_reverse()
}
rec2.set_tid(-1);
rec2.set_pos(-1);
rec2.set_mtid(-1);
rec2.set_mpos(-1);
rec2.set_mate_unmapped();
rec2.set_insert_size(0);
if rec1.is_reverse() {
rec2.set_mate_reverse()
}
} else if rec1.is_unmapped() {
rec1.set_tid(-1);
rec1.set_pos(-1);
rec2.set_mtid(rec1.tid());
rec2.set_mpos(rec1.pos());
rec2.set_mate_unmapped();
rec2.set_insert_size(0);
if rec1.is_reverse() {
rec2.set_mate_reverse()
}
rec1.set_mtid(rec2.tid());
rec1.set_mpos(rec2.pos());
rec1.set_insert_size(0);
rec1.push_aux(b"MQ", &Aux::Char(rec2.mapq()));
if set_mate_cigar {
rec1.push_aux(b"MC", &Aux::String(rec2.cigar().to_string().as_bytes()))
}
if rec2.is_reverse() {
rec1.set_mate_reverse()
}
} else {
rec2.set_tid(-1);
rec2.set_pos(-1);
rec1.set_mtid(rec2.tid());
rec1.set_mpos(rec2.pos());
rec1.set_mate_unmapped();
rec1.set_insert_size(0);
if rec2.is_reverse() {
rec1.set_mate_reverse()
}
rec2.set_mtid(rec1.tid());
rec2.set_mpos(rec1.pos());
rec2.set_insert_size(0);
rec2.push_aux(b"MQ", &Aux::Char(rec1.mapq()));
if set_mate_cigar {
rec2.push_aux(b"MC", &Aux::String(rec1.cigar().to_string().as_bytes()))
}
if rec1.is_reverse() {
rec2.set_mate_reverse()
}
}
let insert_size = BamBuilder::compute_insert_size(rec1, rec2);
rec1.set_insert_size(insert_size);
rec2.set_insert_size(insert_size);
}
fn compute_insert_size(rec1: &Record, rec2: &Record) -> i64 {
if rec1.is_unmapped() || rec2.is_unmapped() {
return 0;
}
if rec1.tid() != rec2.tid() {
return 0;
}
let rec1_5prime_pos = if rec1.is_reverse() {
rec1.cigar().end_pos()
} else {
rec1.pos()
};
let rec2_5prime_pos = if rec2.is_reverse() {
rec2.cigar().end_pos()
} else {
rec2.pos()
};
rec2_5prime_pos - rec1_5prime_pos
}
pub fn to_path(&self, path: &Path) -> Result<(), Error> {
let mut writer = bam::Writer::from_path(path, &self.header, bam::Format::BAM)?;
for record in self.records.iter() {
writer.write(record)?;
}
Ok(())
}
pub fn to_tmp(&self) -> Result<NamedTempFile, Error> {
let tempfile = NamedTempFile::new().unwrap();
self.to_path(tempfile.path())?;
Ok(tempfile)
}
pub fn sort(&mut self) {
self.sort_order.sort(&mut self.records);
}
}
#[derive(Builder, Debug)]
pub struct ReadPairSpec {
name: String,
bases1: String,
bases2: String,
quals1: String,
quals2: String,
contig: i32,
start1: i64,
start2: i64,
unmapped1: bool,
unmapped2: bool,
cigar1: String,
cigar2: String,
mapq1: u8,
mapq2: u8,
strand1: Strand,
strand2: Strand,
attrs: HashMap<String, AuxType>,
}
#[derive(Builder, Debug)]
pub struct ReadFragSpec {
name: String,
bases: String,
quals: String,
contig: i32,
start: i64,
unmapped: bool,
cigar: String,
mapq: u8,
strand: Strand,
attrs: HashMap<String, AuxType>,
}
#[cfg(test)]
mod tests {
use super::*;
use rust_htslib::bam::record::Aux;
#[test]
fn check_proper_pairs() {
let mut builder = BamBuilder::new(
100,
30,
String::from("Test"),
None,
BamSortOrder::NameSorted,
None,
None,
);
for i in 0..10 {
let pair = builder
.pair_builder()
.contig(i)
.start1(10)
.start2(200)
.unmapped1(false)
.unmapped2(false)
.build()
.unwrap();
builder.add_pair(pair);
}
assert_eq!(builder.records.len(), 20);
assert_eq!(builder.records[0].qname(), "0000000000000000".as_bytes());
assert_eq!(builder.records[1].qname(), "0000000000000000".as_bytes());
assert_eq!(builder.records[18].qname(), "0000000000000009".as_bytes());
assert_eq!(builder.records[19].qname(), "0000000000000009".as_bytes());
assert!(builder.records[0].is_paired());
assert!(builder.records[1].is_paired());
assert!(builder.records[0].is_first_in_template());
assert!(builder.records[1].is_last_in_template());
assert_eq!(builder.records[0].mtid(), builder.records[1].tid());
assert_eq!(builder.records[1].mtid(), builder.records[0].tid());
assert_eq!(builder.records[1].mpos(), builder.records[0].pos());
assert_eq!(builder.records[0].mpos(), builder.records[1].pos());
assert!(builder.records[0].is_mate_reverse());
assert!(!builder.records[1].is_mate_reverse());
assert_eq!(builder.records[0].aux(b"MQ").unwrap(), Aux::Char(60));
assert_eq!(builder.records[0].aux(b"MC").unwrap(), Aux::String(b"100M"));
assert_eq!(builder.records[0].aux(b"RG").unwrap(), Aux::String(b"A"));
assert_eq!(builder.records[1].aux(b"RG").unwrap(), Aux::String(b"A"));
}
#[test]
fn check_improper_pair() {
let mut builder = BamBuilder::new(
100,
30,
String::from("Test"),
None,
BamSortOrder::NameSorted,
None,
None,
);
for i in 0..10 {
let pair = builder
.pair_builder()
.contig(i)
.start1(10)
.unmapped1(false)
.build()
.unwrap();
builder.add_pair(pair);
}
assert_eq!(builder.records.len(), 20);
assert_eq!(builder.records[0].qname(), "0000000000000000".as_bytes());
assert_eq!(builder.records[1].qname(), "0000000000000000".as_bytes());
assert_eq!(builder.records[18].qname(), "0000000000000009".as_bytes());
assert_eq!(builder.records[19].qname(), "0000000000000009".as_bytes());
assert!(builder.records[0].is_paired());
assert!(builder.records[1].is_paired());
assert!(builder.records[0].is_first_in_template());
assert!(builder.records[1].is_last_in_template());
assert_eq!(builder.records[0].mtid(), builder.records[1].tid());
assert_eq!(builder.records[1].mtid(), builder.records[0].tid());
assert_eq!(builder.records[1].mpos(), builder.records[0].pos());
assert_eq!(builder.records[0].mpos(), builder.records[1].pos());
assert!(builder.records[0].is_mate_reverse());
assert!(!builder.records[1].is_mate_reverse());
assert_eq!(builder.records[0].aux(b"MQ"), None);
assert_eq!(builder.records[1].aux(b"MQ").unwrap(), Aux::Char(60));
assert_eq!(builder.records[0].aux(b"MC"), None);
assert_eq!(builder.records[1].aux(b"MC").unwrap(), Aux::String(b"100M"));
assert_eq!(builder.records[0].aux(b"RG").unwrap(), Aux::String(b"A"));
assert_eq!(builder.records[1].aux(b"RG").unwrap(), Aux::String(b"A"));
}
}