1#![warn(missing_docs)]
46pub mod bam_order;
47pub mod sequence_dict;
48pub mod wrappers;
49use bam_order::BamSortOrder;
50use derive_builder::*;
51use rand::prelude::*;
52use rust_htslib::bam::{
53 header::{Header, HeaderRecord},
54 record::{CigarString, Record},
55 HeaderView,
56};
57use rust_htslib::{
58 bam::{self, record::Aux},
59 errors::Error,
60};
61use sequence_dict::SequenceDict;
62use std::collections::HashMap;
63use std::rc::Rc;
64use std::{convert::TryFrom, path::Path};
65use tempfile::NamedTempFile;
66use wrappers::{AuxType, ReadGroupId, SampleName, Strand};
67
68const DEFAULT_SEED: usize = 42;
70const DEFAULT_BASES: [char; 4] = ['A', 'C', 'T', 'G'];
72
73#[derive(Debug)]
84pub struct BamBuilder {
85 pub read_length: usize,
87 pub base_quality: u8,
89 pub sample_name: SampleName,
91 pub read_group_id: ReadGroupId,
93 pub sort_order: BamSortOrder,
95 pub header: Header,
97 pub rng: StdRng,
99 pub records: Vec<Record>,
101 pub default_quals: String,
103 pub counter: usize,
105}
106
107impl BamBuilder {
108 pub fn new(
128 read_length: usize,
129 base_quality: u8,
130 sample_name: String,
131 read_group_id: Option<String>,
132 sort: BamSortOrder,
133 pseudo_sd: Option<SequenceDict>,
134 seed: Option<usize>,
135 ) -> BamBuilder {
136 let pseudo_sd = pseudo_sd.unwrap_or(SequenceDict::default());
137 let seed = seed.unwrap_or(DEFAULT_SEED);
138 let rng = StdRng::seed_from_u64(seed as u64);
139 let read_group_id = match read_group_id {
140 Some(rgi) => ReadGroupId(rgi),
141 None => ReadGroupId::default(),
142 };
143 let default_quals = std::iter::repeat(char::from(33 + base_quality))
144 .take(read_length)
145 .collect::<String>();
146
147 let mut header: Header = pseudo_sd.into();
148
149 let mut header_record = HeaderRecord::new("RG".as_bytes());
150 header_record.push_tag("ID".as_bytes(), &read_group_id.0);
151 header.push_record(&header_record);
152
153 BamBuilder {
154 read_length,
155 base_quality,
156 sample_name: SampleName(sample_name),
157 read_group_id,
158 sort_order: sort,
159 header,
160 rng,
161 records: vec![],
162 default_quals,
163 counter: 0,
164 }
165 }
166
167 fn next_name(&mut self) -> String {
169 let name = format!("{:0>16}", self.counter);
170 self.counter += 1;
171 name
172 }
173
174 fn random_bases(&mut self) -> String {
176 (0..self.read_length)
177 .map(|_| DEFAULT_BASES[self.rng.random_range(0..DEFAULT_BASES.len())])
178 .collect::<String>()
179 }
180
181 pub fn frag_builder(&mut self) -> ReadFragSpecBuilder {
183 ReadFragSpecBuilder::default()
184 .name(self.next_name())
185 .bases(self.random_bases())
186 .quals(self.default_quals.clone())
187 .contig(-1)
188 .start(-1)
189 .unmapped(true)
190 .cigar(format!("{}M", &self.read_length))
191 .mapq(60)
192 .strand(Strand::Plus)
193 .attrs(HashMap::new())
194 .to_owned()
195 }
196
197 pub fn add_frag(&mut self, frag_spec: ReadFragSpec) {
199 assert!(
200 frag_spec.bases.is_empty()
201 || frag_spec.quals.is_empty()
202 || frag_spec.bases.len() == frag_spec.quals.len(),
203 "bases and quals were different lengths."
204 );
205
206 let cigar = CigarString::try_from(frag_spec.cigar.as_str()).expect("Malformed cigar");
207 assert!(
208 frag_spec.unmapped
209 || frag_spec.bases.is_empty()
210 || frag_spec.bases.len() == cigar.clone().into_view(0).end_pos() as usize,
211 "bases doesn't agree with cigar on length"
212 );
213 assert!(
214 frag_spec.unmapped
215 || frag_spec.quals.is_empty()
216 || frag_spec.quals.len() == cigar.clone().into_view(0).end_pos() as usize,
217 "quals doesn't agree with cigar on length"
218 );
219
220 let mut r = Record::new();
221 let cigar = CigarString::try_from(frag_spec.cigar.as_str()).unwrap();
222 r.set(
223 frag_spec.name.into_bytes().as_ref(),
224 if !frag_spec.unmapped {
225 Some(&cigar)
226 } else {
227 None
228 },
229 frag_spec.bases.into_bytes().as_ref(),
230 frag_spec.quals.into_bytes().as_ref(),
231 );
232 r.set_header(Rc::new(HeaderView::from_header(&self.header)));
233 r.set_tid(frag_spec.contig);
234 r.set_pos(frag_spec.start);
235 if !frag_spec.unmapped {
236 r.set_mapq(frag_spec.mapq)
237 }
238 match frag_spec.strand {
239 Strand::Plus => (),
240 Strand::Minus => r.set_reverse(),
241 }
242 if frag_spec.unmapped {
243 r.set_unmapped();
244 }
245 r.push_aux("RG".as_bytes(), Aux::String(self.read_group_id.0.as_str()))
246 .unwrap();
247 for (key, value) in frag_spec.attrs.iter() {
248 r.push_aux(key.as_bytes(), value.into()).unwrap();
249 }
250 self.records.push(r);
251 }
252
253 pub fn pair_builder(&mut self) -> ReadPairSpecBuilder {
255 ReadPairSpecBuilder::default()
256 .name(self.next_name())
257 .bases1(self.random_bases())
258 .bases2(self.random_bases())
259 .quals1(self.default_quals.clone())
260 .quals2(self.default_quals.clone())
261 .contig(-1)
262 .start1(-1) .start2(-1)
264 .unmapped1(true)
265 .unmapped2(true)
266 .cigar1(format!("{}M", &self.read_length))
267 .cigar2(format!("{}M", &self.read_length))
268 .mapq1(60)
269 .mapq2(60)
270 .strand1(Strand::Plus)
271 .strand2(Strand::Minus)
272 .attrs(HashMap::new())
273 .to_owned()
274 }
275
276 pub fn add_pair(&mut self, pair_spec: ReadPairSpec) {
278 assert!(
279 pair_spec.bases1.is_empty()
280 || pair_spec.quals1.is_empty()
281 || pair_spec.bases1.len() == pair_spec.quals1.len(),
282 "bases1 and quals1 were different lengths."
283 );
284 assert!(
285 pair_spec.bases2.is_empty()
286 || pair_spec.quals2.is_empty()
287 || pair_spec.bases2.len() == pair_spec.quals2.len(),
288 "bases2 and quals2 were different lengths."
289 );
290
291 let cigar1 = CigarString::try_from(pair_spec.cigar1.as_str()).expect("Malformed cigar1");
292 let cigar2 = CigarString::try_from(pair_spec.cigar2.as_str()).expect("Malformed cigar2");
293 assert!(
294 pair_spec.unmapped1
295 || pair_spec.bases1.is_empty()
296 || pair_spec.bases1.len() == cigar1.clone().into_view(0).end_pos() as usize,
297 "bases1 doesn't agree with cigar on length"
298 );
299 assert!(
300 pair_spec.unmapped1
301 || pair_spec.bases2.is_empty()
302 || pair_spec.bases2.len() == cigar2.clone().into_view(0).end_pos() as usize,
303 "bases2 doesn't agree with cigar on length"
304 );
305 assert!(
306 pair_spec.unmapped1
307 || pair_spec.quals1.is_empty()
308 || pair_spec.quals1.len() == cigar1.clone().into_view(0).end_pos() as usize,
309 "quals1 doesn't agree with cigar on length"
310 );
311 assert!(
312 pair_spec.unmapped2
313 || pair_spec.quals2.is_empty()
314 || pair_spec.quals2.len() == cigar2.clone().into_view(0).end_pos() as usize,
315 "quals2 doesn't agree with cigar on length"
316 );
317
318 let mut r1 = Record::new();
319 let cigar = CigarString::try_from(pair_spec.cigar1.as_str()).unwrap();
320 r1.set(
321 pair_spec.name.into_bytes().as_ref(),
322 if !pair_spec.unmapped1 {
323 Some(&cigar)
324 } else {
325 None
326 },
327 pair_spec.bases1.into_bytes().as_ref(),
328 pair_spec.quals1.into_bytes().as_ref(),
329 );
330 r1.set_header(Rc::new(HeaderView::from_header(&self.header)));
331 r1.set_tid(pair_spec.contig);
332 r1.set_pos(pair_spec.start1);
333 if !pair_spec.unmapped1 {
334 r1.set_mapq(pair_spec.mapq1)
335 }
336 match pair_spec.strand1 {
337 Strand::Plus => (),
338 Strand::Minus => r1.set_reverse(),
339 }
340 r1.set_paired();
341 r1.set_first_in_template();
342 if pair_spec.unmapped1 {
343 r1.set_unmapped();
344 } else {
345 r1.unset_unmapped();
346 }
347
348 let mut r2 = Record::new();
349 let cigar = CigarString::try_from(pair_spec.cigar2.as_str()).unwrap();
350 r2.set(
351 r1.qname(),
352 if !pair_spec.unmapped2 {
353 Some(&cigar)
354 } else {
355 None
356 },
357 pair_spec.bases2.into_bytes().as_ref(),
358 pair_spec.quals2.into_bytes().as_ref(),
359 );
360 r2.set_header(Rc::new(HeaderView::from_header(&self.header)));
361 r2.set_tid(pair_spec.contig);
362 r2.set_pos(pair_spec.start2);
363 if !pair_spec.unmapped2 {
364 r2.set_mapq(pair_spec.mapq2)
365 }
366 match pair_spec.strand2 {
367 Strand::Plus => (),
368 Strand::Minus => r2.set_reverse(),
369 }
370 r2.set_paired();
371 r2.set_last_in_template();
372 if pair_spec.unmapped2 {
373 r2.set_unmapped();
374 } else {
375 r2.unset_unmapped();
376 }
377 r1.push_aux("RG".as_bytes(), Aux::String(self.read_group_id.0.as_str()))
378 .unwrap();
379 r2.push_aux("RG".as_bytes(), Aux::String(self.read_group_id.0.as_str()))
380 .unwrap();
381
382 for (key, value) in pair_spec.attrs.iter() {
383 r1.push_aux(key.as_bytes(), value.into()).unwrap();
384 r2.push_aux(key.as_bytes(), value.into()).unwrap();
385 }
386 BamBuilder::set_mate_info(&mut r1, &mut r2, true);
387 self.records.push(r1);
388 self.records.push(r2);
389 }
390
391 fn set_mate_info(rec1: &mut Record, rec2: &mut Record, set_mate_cigar: bool) {
393 if !rec1.is_unmapped() && !rec2.is_unmapped() {
396 rec1.set_mtid(rec2.tid());
397 rec1.set_mpos(rec2.pos());
398 if rec2.is_reverse() {
399 rec1.set_mate_reverse()
400 }
401 rec1.push_aux(b"MQ", Aux::U8(rec2.mapq())).unwrap();
402
403 rec2.set_mtid(rec1.tid());
404 rec2.set_mpos(rec1.pos());
405 if rec1.is_reverse() {
406 rec2.set_mate_reverse()
407 }
408 rec2.push_aux(b"MQ", Aux::U8(rec1.mapq())).unwrap();
409
410 if set_mate_cigar {
411 rec1.push_aux(b"MC", Aux::String(rec2.cigar().to_string().as_str()))
412 .unwrap();
413 rec2.push_aux(b"MC", Aux::String(rec1.cigar().to_string().as_str()))
414 .unwrap();
415 } } else if rec1.is_unmapped() && rec2.is_unmapped() {
417 rec1.set_tid(-1);
419 rec1.set_pos(-1); rec1.set_mtid(-1);
421 rec1.set_mpos(-1);
422 rec1.set_mate_unmapped();
423 rec1.set_insert_size(0);
424 if rec2.is_reverse() {
425 rec1.set_mate_reverse()
426 }
427
428 rec2.set_tid(-1);
429 rec2.set_pos(-1); rec2.set_mtid(-1);
431 rec2.set_mpos(-1);
432 rec2.set_mate_unmapped();
433 rec2.set_insert_size(0);
434 if rec1.is_reverse() {
435 rec2.set_mate_reverse()
436 }
437 } else if rec1.is_unmapped() {
438 rec1.set_tid(-1);
440 rec1.set_pos(-1); rec2.set_mtid(rec1.tid());
443 rec2.set_mpos(rec1.pos());
444 rec2.set_mate_unmapped();
445 rec2.set_insert_size(0);
446 if rec1.is_reverse() {
447 rec2.set_mate_reverse()
448 }
449
450 rec1.set_mtid(rec2.tid());
451 rec1.set_mpos(rec2.pos());
452 rec1.set_insert_size(0);
453 rec1.push_aux(b"MQ", Aux::U8(rec2.mapq())).unwrap();
454 if set_mate_cigar {
455 rec1.push_aux(b"MC", Aux::String(rec2.cigar().to_string().as_str()))
456 .unwrap();
457 }
458 if rec2.is_reverse() {
459 rec1.set_mate_reverse()
460 }
461 } else {
462 rec2.set_tid(-1);
464 rec2.set_pos(-1); rec1.set_mtid(rec2.tid());
467 rec1.set_mpos(rec2.pos());
468 rec1.set_mate_unmapped();
469 rec1.set_insert_size(0);
470 if rec2.is_reverse() {
471 rec1.set_mate_reverse()
472 }
473
474 rec2.set_mtid(rec1.tid());
475 rec2.set_mpos(rec1.pos());
476 rec2.set_insert_size(0);
477 rec2.push_aux(b"MQ", Aux::U8(rec1.mapq())).unwrap();
478 if set_mate_cigar {
479 rec2.push_aux(b"MC", Aux::String(rec1.cigar().to_string().as_str()))
480 .unwrap();
481 }
482 if rec1.is_reverse() {
483 rec2.set_mate_reverse()
484 }
485 }
486
487 let insert_size = BamBuilder::compute_insert_size(rec1, rec2);
488 rec1.set_insert_size(insert_size);
489 rec2.set_insert_size(insert_size);
490 }
491
492 fn compute_insert_size(rec1: &Record, rec2: &Record) -> i64 {
494 if rec1.is_unmapped() || rec2.is_unmapped() {
495 return 0;
496 }
497 if rec1.tid() != rec2.tid() {
498 return 0;
499 }
500
501 let rec1_5prime_pos = if rec1.is_reverse() {
502 rec1.cigar().end_pos()
503 } else {
504 rec1.pos()
505 };
506 let rec2_5prime_pos = if rec2.is_reverse() {
507 rec2.cigar().end_pos()
508 } else {
509 rec2.pos()
510 };
511
512 rec2_5prime_pos - rec1_5prime_pos }
514
515 pub fn to_path(&self, path: &Path) -> Result<(), Error> {
520 let mut writer = bam::Writer::from_path(path, &self.header, bam::Format::Bam)?;
521 for record in self.records.iter() {
522 writer.write(record)?;
523 }
524
525 Ok(())
526 }
527
528 pub fn to_tmp(&self) -> Result<NamedTempFile, Error> {
533 let tempfile = NamedTempFile::new().unwrap();
534 self.to_path(tempfile.path())?;
535 Ok(tempfile)
536 }
537
538 pub fn sort(&mut self) {
540 self.sort_order.sort(&mut self.records);
541 }
542}
543
544#[derive(Builder, Debug)]
547pub struct ReadPairSpec {
548 name: String,
550 bases1: String,
552 bases2: String,
554 quals1: String,
556 quals2: String,
558 contig: i32,
560 start1: i64,
562 start2: i64,
564 unmapped1: bool,
566 unmapped2: bool,
568 cigar1: String,
570 cigar2: String,
572 mapq1: u8,
574 mapq2: u8,
576 strand1: Strand,
578 strand2: Strand,
580 attrs: HashMap<String, AuxType>,
582}
583
584#[derive(Builder, Debug)]
586pub struct ReadFragSpec {
587 name: String,
589 bases: String,
591 quals: String,
593 contig: i32,
595 start: i64,
597 unmapped: bool,
599 cigar: String,
601 mapq: u8,
603 strand: Strand,
605 attrs: HashMap<String, AuxType>,
607}
608
609#[cfg(test)]
610mod tests {
611 use super::*;
612 use rust_htslib::bam::record::Aux;
613
614 #[test]
615 fn check_proper_pairs() {
616 let mut builder = BamBuilder::new(
617 100,
618 30,
619 String::from("Test"),
620 None,
621 BamSortOrder::NameSorted,
622 None,
623 None,
624 );
625
626 for i in 0..10 {
627 let pair = builder
628 .pair_builder()
629 .contig(i)
630 .start1(10)
631 .start2(200)
632 .unmapped1(false)
633 .unmapped2(false)
634 .build()
635 .unwrap();
636 builder.add_pair(pair);
637 }
638
639 assert_eq!(builder.records.len(), 20);
640 assert_eq!(builder.records[0].qname(), "0000000000000000".as_bytes());
642 assert_eq!(builder.records[1].qname(), "0000000000000000".as_bytes());
643 assert_eq!(builder.records[18].qname(), "0000000000000009".as_bytes());
644 assert_eq!(builder.records[19].qname(), "0000000000000009".as_bytes());
645 assert!(builder.records[0].is_paired());
647 assert!(builder.records[1].is_paired());
648 assert!(builder.records[0].is_first_in_template());
649 assert!(builder.records[1].is_last_in_template());
650 assert_eq!(builder.records[0].mtid(), builder.records[1].tid());
652 assert_eq!(builder.records[1].mtid(), builder.records[0].tid());
653 assert_eq!(builder.records[1].mpos(), builder.records[0].pos());
654 assert_eq!(builder.records[0].mpos(), builder.records[1].pos());
655 assert!(builder.records[0].is_mate_reverse());
657 assert!(!builder.records[1].is_mate_reverse());
658 assert_eq!(builder.records[0].aux(b"MQ").unwrap(), Aux::U8(60));
660 assert_eq!(builder.records[0].aux(b"MC").unwrap(), Aux::String("100M"));
661 assert_eq!(builder.records[0].aux(b"RG").unwrap(), Aux::String("A"));
663 assert_eq!(builder.records[1].aux(b"RG").unwrap(), Aux::String("A"));
664 assert_eq!(builder.records[0].insert_size(), 290);
666 assert_eq!(builder.records[1].insert_size(), 290);
667 }
668
669 #[test]
670 fn check_improper_pair() {
671 let mut builder = BamBuilder::new(
672 100,
673 30,
674 String::from("Test"),
675 None,
676 BamSortOrder::NameSorted,
677 None,
678 None,
679 );
680
681 for i in 0..10 {
682 let pair = builder
684 .pair_builder()
685 .contig(i)
686 .start1(10)
687 .unmapped1(false)
688 .build()
689 .unwrap();
690 builder.add_pair(pair);
691 }
692
693 assert_eq!(builder.records.len(), 20);
694 assert_eq!(builder.records[0].qname(), "0000000000000000".as_bytes());
696 assert_eq!(builder.records[1].qname(), "0000000000000000".as_bytes());
697 assert_eq!(builder.records[18].qname(), "0000000000000009".as_bytes());
698 assert_eq!(builder.records[19].qname(), "0000000000000009".as_bytes());
699 assert!(builder.records[0].is_paired());
701 assert!(builder.records[1].is_paired());
702 assert!(builder.records[0].is_first_in_template());
703 assert!(builder.records[1].is_last_in_template());
704 assert_eq!(builder.records[0].mtid(), builder.records[1].tid());
706 assert_eq!(builder.records[1].mtid(), builder.records[0].tid());
707 assert_eq!(builder.records[1].mpos(), builder.records[0].pos());
708 assert_eq!(builder.records[0].mpos(), builder.records[1].pos());
709 assert_eq!(builder.records[1].tid(), -1);
710 assert_eq!(builder.records[1].pos(), -1);
711 assert!(builder.records[0].is_mate_reverse());
713 assert!(!builder.records[1].is_mate_reverse());
714 assert_eq!(builder.records[0].aux(b"MQ").ok(), None);
716 assert_eq!(builder.records[1].aux(b"MQ").unwrap(), Aux::U8(60));
717 assert_eq!(builder.records[0].aux(b"MC").ok(), None);
718 assert_eq!(builder.records[1].aux(b"MC").unwrap(), Aux::String("100M"));
719 assert_eq!(builder.records[0].aux(b"RG").unwrap(), Aux::String("A"));
721 assert_eq!(builder.records[1].aux(b"RG").unwrap(), Aux::String("A"));
722 assert_eq!(builder.records[0].insert_size(), 0);
724 assert_eq!(builder.records[1].insert_size(), 0);
725 }
726
727 }