1use std::collections::HashSet;
2use std::io::{ErrorKind, Result, Write};
3use std::path::{Path, PathBuf};
5
6use indexmap::IndexMap;
7
8#[cfg(feature = "python-extension")]
9use pyo3::prelude::*;
10
11use rust_htslib::bam::{index, Header, IndexedReader, Read};
12use rust_htslib::htslib;
13
14use crate::cli::DepthOptions;
15use crate::io::get_writer;
16use crate::utils::styled_progress_bar;
17
18fn add_extension(path: &mut PathBuf, extension: impl AsRef<Path>) {
19 match path.extension() {
20 Some(ext) => {
21 let mut ext = ext.to_os_string();
22 ext.push(".");
23 ext.push(extension.as_ref());
24 path.set_extension(ext)
25 }
26 None => path.set_extension(extension.as_ref()),
27 };
28}
29
30pub fn create_index(bam_path: &PathBuf) {
31 let mut csi = PathBuf::from(bam_path);
32 add_extension(&mut csi, "csi");
33 if Path::new(&csi).exists() {
34 return;
35 }
36 let mut bai = PathBuf::from(bam_path);
37 add_extension(&mut bai, "bai");
38 if Path::new(&bai).exists() {
39 return;
40 }
41 match index::build(bam_path, None, index::Type::Csi(14), 1) {
42 Err(e) => eprintln!("Error writing BAM index: {e:?}"),
43 Ok(_) => eprintln!("Successfully created BAM index"),
44 }
45}
46
47pub fn open_bam(
48 bam_path: &Option<PathBuf>,
49 cram_path: &Option<PathBuf>,
50 fasta_path: &Option<PathBuf>,
51 make_index: bool,
52) -> IndexedReader {
53 let bam_cram_path = match bam_path {
54 None => cram_path.as_ref().unwrap(),
55 &Some(_) => bam_path.as_ref().unwrap(),
56 };
57 if make_index {
58 create_index(bam_cram_path);
59 }
60 let mut reader = IndexedReader::from_path(bam_cram_path).unwrap();
61 if cram_path.is_some() && fasta_path.is_some() {
62 reader.set_reference(fasta_path.as_ref().unwrap()).unwrap();
63 }
64 reader
65}
66
67pub fn reads_from_bam<F: Fn()>(
68 seq_names: &HashSet<Vec<u8>>,
69 mut bam: IndexedReader,
70 callback: &Option<F>,
71) -> HashSet<Vec<u8>> {
72 let mut wanted_reads = HashSet::new();
73 let total = seq_names.len();
74 let progress_bar = styled_progress_bar(total, "Locating alignments");
75
76 for seq_name in seq_names {
77 if bam.fetch(seq_name).is_err() {
78 eprintln!("Sequence {:?} not found in BAM file", seq_name)
79 }
80
81 for read in bam
82 .rc_records()
83 .map(|x| x.expect("Failure parsing Bam file"))
84 .filter(|read| {
86 read.flags()
87 & (htslib::BAM_FUNMAP
88 | htslib::BAM_FSECONDARY
89 | htslib::BAM_FQCFAIL
90 | htslib::BAM_FDUP) as u16
91 == 0
92 })
93 {
94 wanted_reads.insert(read.qname().to_vec());
95 }
96
97 if let Some(cb) = callback {
98 cb()
99 }
100 progress_bar.inc(1);
101 }
102 progress_bar.finish();
103 wanted_reads
104}
105
106fn seq_lengths_from_header(
107 bam: &IndexedReader,
108 seq_names: &HashSet<Vec<u8>>,
109) -> IndexMap<String, usize> {
110 let header = Header::from_template(bam.header());
111 let mut seq_lengths: IndexMap<String, usize> = IndexMap::new();
112 for (_, records) in header.to_hashmap() {
113 for record in records {
114 if record.contains_key("SN") {
115 if !seq_names.is_empty() && !seq_names.contains(&record["SN"].as_bytes().to_vec()) {
116 continue;
117 }
118 seq_lengths
119 .entry(record["SN"].to_string())
120 .or_insert(record["LN"].parse::<usize>().unwrap());
121 }
122 }
123 }
124 seq_lengths
125}
126
127#[derive(Clone, Debug)]
128#[cfg_attr(feature = "python-extension", pyclass)]
129pub struct BinnedCov {
130 seq_name: String,
131 bins: Vec<f64>,
132 bin_count: usize,
133 last_bin: usize,
134 seq_length: usize,
135 step: usize,
136}
137
138#[cfg(feature = "python-extension")]
139#[pymethods]
140impl BinnedCov {
141 #[getter]
142 fn seq_name(&self) -> &str {
143 &self.seq_name
144 }
145 #[getter]
146 fn bins(&self) -> &Vec<f64> {
147 &self.bins
148 }
149 #[getter]
150 fn bin_count(&self) -> usize {
151 self.bin_count
152 }
153 #[getter]
154 fn last_bin(&self) -> usize {
155 self.last_bin
156 }
157 #[getter]
158 fn seq_length(&self) -> usize {
159 self.seq_length
160 }
161 #[getter]
162 fn step(&self) -> usize {
163 self.step
164 }
165}
166
167fn depth_to_bed(
168 raw_cov: Vec<usize>,
169 length: &usize,
170 step: usize,
171 seq_name: &String,
172 writer: &mut Box<dyn Write>,
173) -> Result<()> {
174 let mut bins: Vec<f64> = vec![];
175 let mut divisor = step;
176 let mut end: usize = 0;
177 let seq_length = length.to_owned();
178 for cov in raw_cov {
179 end += step;
180 if end > seq_length {
181 divisor -= end - seq_length;
182 }
183 bins.push(cov as f64 / divisor as f64);
184 }
185 let mut start = 0;
186 let mut end;
187 let bin_count = bins.len();
188 for bin in bins.iter().take(bin_count) {
189 end = start + step;
190 if end > seq_length {
191 end = seq_length;
192 }
193 let line = format!("{}\t{}\t{}\t{:.2}", seq_name, start, end, bin);
194 writeln!(writer, "{}", line)?;
195 start = end;
196 }
197 Ok(())
198}
199
200pub fn bed_from_bam<F: Fn()>(
201 seq_lengths: &IndexMap<String, usize>,
202 mut bam: IndexedReader,
203 options: &DepthOptions,
204 callback: &Option<F>,
205) {
206 let total = seq_lengths.len();
207 let progress_bar = styled_progress_bar(total, "Locating alignments");
208 let bin_size = options.bin_size;
209 let step = bin_size;
210 let mut writer = get_writer(&options.bed);
211 for (seq_name, length) in seq_lengths.clone() {
212 let mut raw_cov: Vec<usize> = vec![];
213 for _ in (0..length).step_by(step) {
214 raw_cov.push(0)
215 }
216 if bam.fetch(&seq_name).is_err() {
217 eprintln!("Sequence {:?} not found in BAM file", seq_name)
218 }
219 for p in bam.pileup() {
220 let pileup = p.unwrap();
221 let bin = pileup.pos() as usize / step;
222 raw_cov[bin] += pileup.depth() as usize;
223 }
224 if let Some(cb) = callback {
225 cb()
226 }
227 match depth_to_bed(raw_cov, &length, step, &seq_name, &mut writer) {
228 Err(err) if err.kind() == ErrorKind::BrokenPipe => return,
229 Err(err) => panic!("unable to write {} to bed file: {}", &seq_name, err),
230 Ok(_) => (),
231 };
232 progress_bar.inc(1);
233 }
234 progress_bar.finish();
235}
236
237fn depth_to_cov(raw_cov: Vec<usize>, length: &usize, step: usize, seq_name: &String) -> BinnedCov {
238 let mut bins: Vec<f64> = vec![];
239 let mut divisor = step;
240 let mut end: usize = 0;
241 let seq_length = length.to_owned();
242 for cov in raw_cov {
243 end += step;
244 if end > seq_length {
245 divisor -= end - seq_length;
246 }
247 bins.push(cov as f64 / divisor as f64);
248 }
249 BinnedCov {
250 seq_name: seq_name.to_owned(),
251 step,
252 bin_count: bins.len(),
253 bins,
254 seq_length,
255 last_bin: divisor,
256 }
257}
258
259pub fn depth_from_bam<F: Fn()>(
260 seq_lengths: &IndexMap<String, usize>,
261 mut bam: IndexedReader,
262 options: &DepthOptions,
263 callback: &Option<F>,
264) -> Vec<BinnedCov> {
265 let total = seq_lengths.len();
266 let progress_bar = styled_progress_bar(total, "Locating alignments");
267 let bin_size = options.bin_size;
268 let step = bin_size;
269 let mut binned_covs = vec![];
270 for (seq_name, length) in seq_lengths.clone() {
271 let mut raw_cov: Vec<usize> = vec![];
272 for _ in (0..length).step_by(step) {
273 raw_cov.push(0)
274 }
275 if bam.fetch(&seq_name).is_err() {
276 eprintln!("Sequence {:?} not found in BAM file", seq_name)
277 }
278 for p in bam.pileup() {
279 let pileup = p.unwrap();
280 let bin = pileup.pos() as usize / step;
281 raw_cov[bin] += pileup.depth() as usize;
282 }
283 if let Some(cb) = callback {
284 cb()
285 }
286 binned_covs.push(depth_to_cov(raw_cov, &length, step, &seq_name));
287 progress_bar.inc(1);
293 }
294 progress_bar.finish();
295 binned_covs
296}
297
298pub fn get_bed_file<F: Fn()>(
299 bam: IndexedReader,
300 seq_names: &HashSet<Vec<u8>>,
301 options: &DepthOptions,
302 callback: &Option<F>,
303) {
304 let seq_lengths = seq_lengths_from_header(&bam, seq_names);
305 bed_from_bam(&seq_lengths, bam, options, callback);
306}
307
308pub fn get_depth<F: Fn()>(
309 bam: IndexedReader,
310 seq_names: &HashSet<Vec<u8>>,
311 options: &DepthOptions,
312 callback: &Option<F>,
313) -> Vec<BinnedCov> {
314 let seq_lengths = seq_lengths_from_header(&bam, seq_names);
315 depth_from_bam(&seq_lengths, bam, options, callback)
316}