1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
use crate::util::oarfish_types::{InMemoryAlignmentStore, TranscriptInfo};
use noodles_bam as bam;
use noodles_sam::Header;
use num_format::{Locale, ToFormattedString};
use std::io;
use std::path::Path;
use tracing::{info, trace};
pub fn read_and_verify_header<R: io::BufRead>(
reader: &mut bam::io::Reader<R>,
aln_file: &Path,
) -> anyhow::Result<Header> {
// read the bam file header, print out some basic info
let header = reader.read_header()?;
info!(
"read header from BAM file {}, contains {} reference sequences.",
aln_file.display(),
header
.reference_sequences()
.len()
.to_formatted_string(&Locale::en)
);
let mut saw_minimap2 = false;
let mut progs = vec![];
// explicitly check that alignment was done with a supported
// aligner (right now, just minimap2).
for (prog, _pmap) in header.programs().roots() {
if prog == "minimap2" {
saw_minimap2 = true;
break;
} else {
progs.push(prog);
}
}
assert!(
saw_minimap2,
"Currently, only minimap2 is supported as an aligner. The bam file listed {:?}.",
progs
);
info!("saw minimap2 as a program in the header; proceeding.");
Ok(header)
}
pub fn parse_alignments<R: io::BufRead>(
store: &mut InMemoryAlignmentStore,
header: &Header,
reader: &mut bam::io::Reader<R>,
txps: &mut [TranscriptInfo],
) -> anyhow::Result<()> {
// we'll need these to keep track of which alignments belong
// to which reads.
let mut prev_read = String::new();
let mut num_unmapped = 0_u64;
let mut records_for_read = vec![];
// Parse the input alignemnt file, gathering the alignments aggregated
// by their source read. **Note**: this requires that we have a
// name-sorted input bam file (currently, aligned against the transcriptome).
//
// *NOTE*: this had to be changed from `records` to `record_bufs` or
// critical information was missing from the records. This happened when
// moving to the new version of noodles. Track `https://github.com/zaeleus/noodles/issues/230`
// to see if it's clear why this is the case
for (i, result) in reader.record_bufs(header).enumerate() {
let record = result?;
if i % 100_000 == 1 {
trace!("processed {i} alignment records");
}
// unmapped reads don't contribute to quantification
// but we track them.
if record.flags().is_unmapped() {
num_unmapped += 1;
continue;
}
let record_copy = record.clone();
if let Some(rname) = record.name() {
let rstring: String = String::from_utf8_lossy(rname.as_ref()).into_owned();
// if this is an alignment for the same read, then
// push it onto our temporary vector.
if prev_read == rstring {
if let Some(_ref_id) = record.reference_sequence_id() {
records_for_read.push(record_copy);
}
} else {
// otherwise, record the alignment range for the
// previous read record.
if !prev_read.is_empty() {
store.add_group(txps, &mut records_for_read);
if records_for_read.len() == 1 {
store.inc_unique_alignments();
}
records_for_read.clear();
}
// the new "prev_read" name is the current read name
// so it becomes the first on the new alignment range
// vector.
prev_read = rstring;
if let Some(_ref_id) = record.reference_sequence_id() {
records_for_read.push(record_copy);
}
}
}
}
// if we end with a non-empty alignment range vector, then
// add that group.
if !records_for_read.is_empty() {
store.add_group(txps, &mut records_for_read);
if records_for_read.len() == 1 {
store.inc_unique_alignments();
}
records_for_read.clear();
}
info!(
"the alignment file contained {} unmapped read records.",
num_unmapped.to_formatted_string(&Locale::en)
);
Ok(())
}