Skip to main content

blobtk/
bam.rs

1use std::collections::HashSet;
2use std::io::{ErrorKind, Result, Write};
3// use std::ops::Index;
4use std::path::{Path, PathBuf};
5
6use indexmap::IndexMap;
7
8#[cfg(feature = "python-extension")]
9use pyo3::prelude::*;
10
11use rust_htslib::bam::{index, Header, IndexedReader, Read};
12use rust_htslib::htslib;
13
14use crate::cli::DepthOptions;
15use crate::io::get_writer;
16use crate::utils::styled_progress_bar;
17
18fn add_extension(path: &mut PathBuf, extension: impl AsRef<Path>) {
19    match path.extension() {
20        Some(ext) => {
21            let mut ext = ext.to_os_string();
22            ext.push(".");
23            ext.push(extension.as_ref());
24            path.set_extension(ext)
25        }
26        None => path.set_extension(extension.as_ref()),
27    };
28}
29
30pub fn create_index(bam_path: &PathBuf) {
31    let mut csi = PathBuf::from(bam_path);
32    add_extension(&mut csi, "csi");
33    if Path::new(&csi).exists() {
34        return;
35    }
36    let mut bai = PathBuf::from(bam_path);
37    add_extension(&mut bai, "bai");
38    if Path::new(&bai).exists() {
39        return;
40    }
41    match index::build(bam_path, None, index::Type::Csi(14), 1) {
42        Err(e) => eprintln!("Error writing BAM index: {e:?}"),
43        Ok(_) => eprintln!("Successfully created BAM index"),
44    }
45}
46
47pub fn open_bam(
48    bam_path: &Option<PathBuf>,
49    cram_path: &Option<PathBuf>,
50    fasta_path: &Option<PathBuf>,
51    make_index: bool,
52) -> IndexedReader {
53    let bam_cram_path = match bam_path {
54        None => cram_path.as_ref().unwrap(),
55        &Some(_) => bam_path.as_ref().unwrap(),
56    };
57    if make_index {
58        create_index(bam_cram_path);
59    }
60    let mut reader = IndexedReader::from_path(bam_cram_path).unwrap();
61    if cram_path.is_some() && fasta_path.is_some() {
62        reader.set_reference(fasta_path.as_ref().unwrap()).unwrap();
63    }
64    reader
65}
66
67pub fn reads_from_bam<F: Fn()>(
68    seq_names: &HashSet<Vec<u8>>,
69    mut bam: IndexedReader,
70    callback: &Option<F>,
71) -> HashSet<Vec<u8>> {
72    let mut wanted_reads = HashSet::new();
73    let total = seq_names.len();
74    let progress_bar = styled_progress_bar(total, "Locating alignments");
75
76    for seq_name in seq_names {
77        if bam.fetch(seq_name).is_err() {
78            eprintln!("Sequence {:?} not found in BAM file", seq_name)
79        }
80
81        for read in bam
82            .rc_records()
83            .map(|x| x.expect("Failure parsing Bam file"))
84            // TODO: include filter options in config
85            .filter(|read| {
86                read.flags()
87                    & (htslib::BAM_FUNMAP
88                        | htslib::BAM_FSECONDARY
89                        | htslib::BAM_FQCFAIL
90                        | htslib::BAM_FDUP) as u16
91                    == 0
92            })
93        {
94            wanted_reads.insert(read.qname().to_vec());
95        }
96
97        if let Some(cb) = callback {
98            cb()
99        }
100        progress_bar.inc(1);
101    }
102    progress_bar.finish();
103    wanted_reads
104}
105
106fn seq_lengths_from_header(
107    bam: &IndexedReader,
108    seq_names: &HashSet<Vec<u8>>,
109) -> IndexMap<String, usize> {
110    let header = Header::from_template(bam.header());
111    let mut seq_lengths: IndexMap<String, usize> = IndexMap::new();
112    for (_, records) in header.to_hashmap() {
113        for record in records {
114            if record.contains_key("SN") {
115                if !seq_names.is_empty() && !seq_names.contains(&record["SN"].as_bytes().to_vec()) {
116                    continue;
117                }
118                seq_lengths
119                    .entry(record["SN"].to_string())
120                    .or_insert(record["LN"].parse::<usize>().unwrap());
121            }
122        }
123    }
124    seq_lengths
125}
126
127#[derive(Clone, Debug)]
128#[cfg_attr(feature = "python-extension", pyclass)]
129pub struct BinnedCov {
130    seq_name: String,
131    bins: Vec<f64>,
132    bin_count: usize,
133    last_bin: usize,
134    seq_length: usize,
135    step: usize,
136}
137
138#[cfg(feature = "python-extension")]
139#[pymethods]
140impl BinnedCov {
141    #[getter]
142    fn seq_name(&self) -> &str {
143        &self.seq_name
144    }
145    #[getter]
146    fn bins(&self) -> &Vec<f64> {
147        &self.bins
148    }
149    #[getter]
150    fn bin_count(&self) -> usize {
151        self.bin_count
152    }
153    #[getter]
154    fn last_bin(&self) -> usize {
155        self.last_bin
156    }
157    #[getter]
158    fn seq_length(&self) -> usize {
159        self.seq_length
160    }
161    #[getter]
162    fn step(&self) -> usize {
163        self.step
164    }
165}
166
167fn depth_to_bed(
168    raw_cov: Vec<usize>,
169    length: &usize,
170    step: usize,
171    seq_name: &String,
172    writer: &mut Box<dyn Write>,
173) -> Result<()> {
174    let mut bins: Vec<f64> = vec![];
175    let mut divisor = step;
176    let mut end: usize = 0;
177    let seq_length = length.to_owned();
178    for cov in raw_cov {
179        end += step;
180        if end > seq_length {
181            divisor -= end - seq_length;
182        }
183        bins.push(cov as f64 / divisor as f64);
184    }
185    let mut start = 0;
186    let mut end;
187    let bin_count = bins.len();
188    for bin in bins.iter().take(bin_count) {
189        end = start + step;
190        if end > seq_length {
191            end = seq_length;
192        }
193        let line = format!("{}\t{}\t{}\t{:.2}", seq_name, start, end, bin);
194        writeln!(writer, "{}", line)?;
195        start = end;
196    }
197    Ok(())
198}
199
200pub fn bed_from_bam<F: Fn()>(
201    seq_lengths: &IndexMap<String, usize>,
202    mut bam: IndexedReader,
203    options: &DepthOptions,
204    callback: &Option<F>,
205) {
206    let total = seq_lengths.len();
207    let progress_bar = styled_progress_bar(total, "Locating alignments");
208    let bin_size = options.bin_size;
209    let step = bin_size;
210    let mut writer = get_writer(&options.bed);
211    for (seq_name, length) in seq_lengths.clone() {
212        let mut raw_cov: Vec<usize> = vec![];
213        for _ in (0..length).step_by(step) {
214            raw_cov.push(0)
215        }
216        if bam.fetch(&seq_name).is_err() {
217            eprintln!("Sequence {:?} not found in BAM file", seq_name)
218        }
219        for p in bam.pileup() {
220            let pileup = p.unwrap();
221            let bin = pileup.pos() as usize / step;
222            raw_cov[bin] += pileup.depth() as usize;
223        }
224        if let Some(cb) = callback {
225            cb()
226        }
227        match depth_to_bed(raw_cov, &length, step, &seq_name, &mut writer) {
228            Err(err) if err.kind() == ErrorKind::BrokenPipe => return,
229            Err(err) => panic!("unable to write {} to bed file: {}", &seq_name, err),
230            Ok(_) => (),
231        };
232        progress_bar.inc(1);
233    }
234    progress_bar.finish();
235}
236
237fn depth_to_cov(raw_cov: Vec<usize>, length: &usize, step: usize, seq_name: &String) -> BinnedCov {
238    let mut bins: Vec<f64> = vec![];
239    let mut divisor = step;
240    let mut end: usize = 0;
241    let seq_length = length.to_owned();
242    for cov in raw_cov {
243        end += step;
244        if end > seq_length {
245            divisor -= end - seq_length;
246        }
247        bins.push(cov as f64 / divisor as f64);
248    }
249    BinnedCov {
250        seq_name: seq_name.to_owned(),
251        step,
252        bin_count: bins.len(),
253        bins,
254        seq_length,
255        last_bin: divisor,
256    }
257}
258
259pub fn depth_from_bam<F: Fn()>(
260    seq_lengths: &IndexMap<String, usize>,
261    mut bam: IndexedReader,
262    options: &DepthOptions,
263    callback: &Option<F>,
264) -> Vec<BinnedCov> {
265    let total = seq_lengths.len();
266    let progress_bar = styled_progress_bar(total, "Locating alignments");
267    let bin_size = options.bin_size;
268    let step = bin_size;
269    let mut binned_covs = vec![];
270    for (seq_name, length) in seq_lengths.clone() {
271        let mut raw_cov: Vec<usize> = vec![];
272        for _ in (0..length).step_by(step) {
273            raw_cov.push(0)
274        }
275        if bam.fetch(&seq_name).is_err() {
276            eprintln!("Sequence {:?} not found in BAM file", seq_name)
277        }
278        for p in bam.pileup() {
279            let pileup = p.unwrap();
280            let bin = pileup.pos() as usize / step;
281            raw_cov[bin] += pileup.depth() as usize;
282        }
283        if let Some(cb) = callback {
284            cb()
285        }
286        binned_covs.push(depth_to_cov(raw_cov, &length, step, &seq_name));
287        // match depth_to_bed(raw_cov, &length, step, &seq_name, &mut writer) {
288        //     Err(err) if err.kind() == ErrorKind::BrokenPipe => return,
289        //     Err(err) => panic!("unable to write {} to bed file: {}", &seq_name, err),
290        //     Ok(_) => (),
291        // };
292        progress_bar.inc(1);
293    }
294    progress_bar.finish();
295    binned_covs
296}
297
298pub fn get_bed_file<F: Fn()>(
299    bam: IndexedReader,
300    seq_names: &HashSet<Vec<u8>>,
301    options: &DepthOptions,
302    callback: &Option<F>,
303) {
304    let seq_lengths = seq_lengths_from_header(&bam, seq_names);
305    bed_from_bam(&seq_lengths, bam, options, callback);
306}
307
308pub fn get_depth<F: Fn()>(
309    bam: IndexedReader,
310    seq_names: &HashSet<Vec<u8>>,
311    options: &DepthOptions,
312    callback: &Option<F>,
313) -> Vec<BinnedCov> {
314    let seq_lengths = seq_lengths_from_header(&bam, seq_names);
315    depth_from_bam(&seq_lengths, bam, options, callback)
316}