bamnado/
bam_utils.rs

1use ahash::{HashMap, HashSet};
2use anyhow::{Context, Result};
3use bio_types::annot::contig::Contig;
4use bio_types::annot::loc::Loc;
5use bio_types::strand::ReqStrand;
6use log::{debug, info, warn};
7
8use noodles::core::{Position, Region};
9use noodles::{bam, sam, bed};
10use noodles::sam::alignment::record::data::field::tag::Tag;
11use polars::prelude::*;
12use rust_lapper::Interval;
13use rust_lapper::Lapper;
14use std::io::{BufRead, Write};
15use std::ops::Bound;
16use std::path::{Path, PathBuf};
17use regex::Regex;
18use std::str::FromStr;
19use bio_types::strand::Strand as BioStrand;
20
21pub const CB: [u8; 2] = [b'C', b'B'];
22pub type Iv = Interval<usize, u32>;
23
24
25
26#[derive(Debug, Clone, PartialEq, Eq, Copy)]
27pub enum Strand {
28    Forward,
29    Reverse,
30    Both,
31}
32impl FromStr for Strand {
33    type Err = String;
34
35    fn from_str(s: &str) -> Result<Self, Self::Err> {
36        match s.to_lowercase().as_str() {
37            "forward" => Ok(Strand::Forward),
38            "reverse" => Ok(Strand::Reverse),
39            "both" => Ok(Strand::Both),
40            _ => Err(format!("Invalid strand value: {}", s)),
41        }
42    }
43}
44impl std::fmt::Display for Strand {
45    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
46        match self {
47            Strand::Forward => write!(f, "forward"),
48            Strand::Reverse => write!(f, "reverse"),
49            Strand::Both => write!(f, "both"),
50        }
51    }
52}
53
54impl Into<BioStrand> for Strand {
55    fn into(self) -> BioStrand {
56        match self {
57            Strand::Forward => BioStrand::Forward,
58            Strand::Reverse => BioStrand::Reverse,
59            Strand::Both => BioStrand::Unknown,
60        }
61    }
62}
63
64
65
66
67pub fn progress_bar(length: u64, message: String) -> indicatif::ProgressBar {
68    // Progress bar
69    let progress_style = indicatif::ProgressStyle::with_template(
70        "{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta}) {msg}",
71    )
72    .unwrap()
73    .progress_chars("##-");
74
75    let progress_bar = indicatif::ProgressBar::new(length as u64);
76    progress_bar.set_style(progress_style.clone());
77    progress_bar.set_message(message);
78    progress_bar
79}
80
81pub struct CellBarcodes {
82    barcodes: HashSet<String>,
83}
84
85impl CellBarcodes {
86    pub fn new() -> Self {
87        Self {
88            barcodes: HashSet::default(),
89        }
90    }
91
92    pub fn barcodes(&self) -> HashSet<String> {
93        self.barcodes.clone()
94    }
95
96    pub fn is_empty(&self) -> bool {
97        self.barcodes.is_empty()
98    }
99
100    pub fn from_csv(file_path: &PathBuf) -> Result<Self> {
101        let path = Path::new(file_path).to_path_buf();
102
103        let df = CsvReadOptions::default()
104            .with_has_header(true)
105            .try_into_reader_with_file_path(Some(path))?
106            .finish()?;
107        let mut barcodes = HashSet::default();
108
109        for barcode in df.column("barcode").unwrap().str().unwrap() {
110            let barcode = barcode.unwrap().to_string();
111            barcodes.insert(barcode);
112        }
113
114        println!("Number of barcodes: {}", barcodes.len());
115        println!(
116            "First 10 barcodes: {:?}",
117            barcodes.iter().take(10).collect::<Vec<&String>>()
118        );
119
120        Ok(Self { barcodes })
121    }
122}
123
124pub struct CellBarcodesMulti {
125    barcodes: Vec<HashSet<String>>,
126}
127
128impl CellBarcodesMulti {
129    pub fn new() -> Self {
130        Self {
131            barcodes: Vec::new(),
132        }
133    }
134
135    pub fn barcodes(&self) -> Vec<HashSet<String>> {
136        self.barcodes.clone()
137    }
138
139    pub fn is_empty(&self) -> bool {
140        self.barcodes.is_empty()
141    }
142
143    pub fn from_csv(file_path: &PathBuf) -> Result<Self> {
144        let path = Path::new(file_path).to_path_buf();
145
146        let df = CsvReadOptions::default()
147            .with_has_header(true)
148            .try_into_reader_with_file_path(Some(path))?
149            .finish()?;
150
151        let mut barcodes = Vec::new();
152
153        for (i, column) in df.iter().enumerate() {
154            let barcode = column.str().unwrap();
155
156            let bc = barcode
157                .iter()
158                .filter_map(|x| x.map(|x| x.to_string()))
159                .collect::<HashSet<String>>();
160
161            barcodes.push(bc);
162        }
163
164        Ok(Self { barcodes })
165    }
166}
167
168#[derive(Debug)]
169struct ChromosomeStats {
170    chrom: String,
171    length: u64,
172    mapped: u64,
173    unmapped: u64,
174}
175
176
177pub fn bam_header(file_path: PathBuf) -> Result<sam::Header> {
178    // Read the header of a BAM file
179    // If the noodles crate fails to read the header, we fall back to samtools
180    // This is a bit of a hack but it works for now
181    // The noodles crate is more strict about the header format than samtools
182
183    // Check that the file exists
184    if !file_path.exists() {
185        return Err(anyhow::Error::from(std::io::Error::new(
186            std::io::ErrorKind::NotFound,
187            format!("File not found: {}", file_path.display()),
188        )));
189    };
190
191    let mut reader = bam::io::indexed_reader::Builder::default()
192        .build_from_path(file_path.clone())
193        .expect("Failed to open file");
194
195    let header = match reader.read_header() {
196        std::result::Result::Ok(header) => header,
197        Err(e) => {
198            debug!(
199                "Failed to read header using noodels falling back to samtools: {}",
200                e
201            );
202
203            let header_samtools = std::process::Command::new("samtools")
204                .arg("view")
205                .arg("-H")
206                .arg(file_path.clone())
207                .output()
208                .expect("Failed to run samtools")
209                .stdout;
210
211            let header_str =
212                String::from_utf8(header_samtools).expect("Failed to convert header to string");
213
214            // Slight hack here for CellRanger BAM files that are missing the version info
215            let header_string =
216                header_str.replace("@HD\tSO:coordinate\n", "@HD\tVN:1.6\tSO:coordinate\n");
217            let header_str = header_string.as_bytes();
218            let mut reader = sam::io::Reader::new(header_str);
219            let header = reader
220                .read_header()
221                .expect("Failed to read header with samtools");
222            header
223        }
224    };
225    Ok(header)
226}
227
228pub struct BamStats {
229    // BAM file stats
230
231    // File path
232    file_path: PathBuf,
233
234    // Header
235    header: sam::Header,
236
237    // Contigs present in the BAM file
238    contigs: Vec<Contig<String, ReqStrand>>,
239
240    // Chromosome stats
241    chrom_stats: HashMap<String, ChromosomeStats>,
242
243    // Total number of reads
244    n_reads: u64,
245    // Number of reads mapped to the genome
246    n_mapped: u64,
247    // Number of reads unmapped to the genome
248    n_unmapped: u64,
249}
250
251impl BamStats {
252    pub fn new(file_path: PathBuf) -> Result<Self> {
253        // Read the header of the BAM file
254        let header = bam_header(file_path.clone())?;
255
256        // Get the contigs from the header
257        let contigs = header
258            .reference_sequences()
259            .iter()
260            .map(|(name, map)| {
261                let name = name.to_string();
262                let length = map.length().get();
263                let contig = Contig::new(name, 1, length, ReqStrand::Forward);
264                contig
265            })
266         .collect();
267
268        // Get the index of the BAM file
269        let bam_reader = bam::io::indexed_reader::Builder::default()
270            .build_from_path(file_path.clone())
271            .expect("Failed to open file");
272        let index = bam_reader.index();
273
274        // Get the chromosome stats
275        let mut chrom_stats = HashMap::default();
276
277        for ((reference_sequence_name_buf, reference_sequence), index_reference_sequence) in header
278            .reference_sequences()
279            .iter()
280            .zip(index.reference_sequences())
281        {
282            let reference_sequence_name = std::str::from_utf8(reference_sequence_name_buf)
283                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))?;
284
285            let (mapped_record_count, unmapped_record_count) = index_reference_sequence
286                .metadata()
287                .map(|m| (m.mapped_record_count(), m.unmapped_record_count()))
288                .unwrap_or_default();
289
290            let stats = ChromosomeStats {
291                chrom: reference_sequence_name.to_string(),
292                length: usize::from(reference_sequence.length()) as u64,
293                mapped: mapped_record_count,
294                unmapped: unmapped_record_count,
295            };
296
297            chrom_stats.insert(reference_sequence_name.to_string(), stats);
298        }
299
300        let mut unmapped_record_count = index.unplaced_unmapped_record_count().unwrap_or_default();
301
302        let mut mapped_record_count = 0;
303
304        for (_, stats) in chrom_stats.iter() {
305            mapped_record_count += stats.mapped;
306            unmapped_record_count += stats.unmapped;
307        }
308
309        let n_reads = mapped_record_count + unmapped_record_count;
310
311        Ok(Self {
312            file_path,
313            header,
314            contigs,
315            chrom_stats,
316            n_reads,
317            n_mapped: mapped_record_count,
318            n_unmapped: unmapped_record_count,
319        })
320    }
321
322    pub fn estimate_genome_chunk_length(&self, bin_size: u64) -> Result<u64> {
323        // genomeLength = sum(bamHandles[0].lengths)
324        // max_reads_per_bp = max([float(x) / genomeLength for x in mappedList])
325        // genomeChunkLength = int(min(5e6, int(2e6 / (max_reads_per_bp * len(bamHandles)))))
326        // genomeChunkLength -= genomeChunkLength % tile_size
327
328        let stats = &self.chrom_stats;
329        let genome_length = stats.values().map(|x| x.length).sum::<u64>();
330        let max_reads_per_bp = self.n_mapped as f64 / genome_length as f64;
331        let genome_chunk_length = f64::from(5e6).min(2e6 / (max_reads_per_bp));
332
333        let correction = genome_chunk_length % bin_size as f64;
334        let genome_chunk_length = genome_chunk_length - correction;
335
336        Ok(genome_chunk_length as u64)
337    }
338
339    pub fn genome_chunks(&self, bin_size: u64) -> Result<Vec<Region>> {
340        let genome_chunk_length = self.estimate_genome_chunk_length(bin_size)?;
341        let chrom_chunks = self
342            .chrom_stats
343            .iter()
344            .map(|(chrom, stats)| {
345                let mut chunks = Vec::new();
346                let mut chunk_start = 1;
347                let chrom_end = stats.length;
348
349                while chunk_start <= chrom_end {
350                    // Corrected to include the last position in the range
351                    let chunk_end = chunk_start + genome_chunk_length - 1; // Adjust to ensure the chunk covers exactly genome_chunk_length positions
352                    let chunk_end = chunk_end.min(chrom_end); // Ensure we do not exceed the chromosome length
353
354                    let start = Position::try_from(chunk_start as usize).unwrap();
355                    let end = Position::try_from(chunk_end as usize).unwrap();
356
357                    let region = Region::new(&*chrom.clone(), start..=end);
358                    chunks.push(region);
359                    chunk_start = chunk_end + 1; // Corrected to start the next chunk right after the current chunk ends
360                }
361                chunks
362            })
363            .flatten()
364            .collect();
365
366        Ok(chrom_chunks)
367    }
368
369    pub fn chromosome_id_to_chromosome_name_mapping(&self) -> HashMap<usize, String> {
370        let mut ref_id_mapping = HashMap::default();
371        for (i, (name, _)) in self.header.reference_sequences().iter().enumerate() {
372            let name = std::str::from_utf8(name)
373                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))
374                .unwrap()
375                .to_string();
376            ref_id_mapping.insert(i, name);
377        }
378        ref_id_mapping
379    }
380
381    pub fn chromosome_name_to_id_mapping(&self) -> Result<HashMap<String, usize>> {
382        let mut mapping = HashMap::default();
383        let id_to_name = self.chromosome_id_to_chromosome_name_mapping();
384
385        // Invert the mapping
386        for (id, name) in id_to_name.iter() {
387            mapping.insert(name.clone(), *id);
388        }
389
390        Ok(mapping)
391    }
392
393    pub fn chromsizes_ref_id(&self) -> Result<HashMap<usize, u64>> {
394        let mut chromsizes = HashMap::default();
395        for (i, (_, map)) in self.header.reference_sequences().iter().enumerate() {
396            let length = map.length().get();
397            chromsizes.insert(i, length as u64);
398        }
399        Ok(chromsizes)
400    }
401
402    pub fn chromsizes_ref_name(&self) -> Result<HashMap<String, u64>> {
403        let mut chromsizes = HashMap::default();
404        for (name, map) in self.header.reference_sequences() {
405            let length = map.length().get();
406            let name = std::str::from_utf8(name)
407                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))
408                .unwrap()
409                .to_string();
410            chromsizes.insert(name, length as u64);
411        }
412        Ok(chromsizes)
413    }
414
415    pub fn write_chromsizes(&self, outfile: PathBuf) -> Result<()> {
416        // // Write the chromosome sizes to a file
417        // let chrom_sizes = self.chromsizes_ref_name()?;
418        // // Write to the output file
419        // info!("Writing chromosome sizes to {}", outfile.display());
420        // let mut writer = std::io::BufWriter::new(std::fs::File::create(outfile)?);
421
422        // for (chrom, length) in chrom_sizes.iter() {
423        //     writeln!(writer, "{}\t{}", chrom, length)?;
424        // }
425        // Ok(())
426
427
428        let chrom_sizes = self.chromsizes_ref_name()?;
429        // Convert the chromosome sizes to a DataFrame
430        let mut df = DataFrame::new(vec![
431            Column::new("chrom".into(), chrom_sizes.keys().cloned().collect::<Vec<String>>()),
432            Column::new("length".into(), chrom_sizes.values().cloned().collect::<Vec<u64>>()),
433        ])?;
434
435        // Sort the DataFrame by chromosome name
436        df.sort_in_place(["chrom"], SortMultipleOptions::default())?;
437
438        // Write the DataFrame to a CSV file
439        info!("Writing chromosome sizes to {}", outfile.display());
440        let mut file = std::fs::File::create(outfile)?;
441        CsvWriter::new(&mut file)
442            .include_header(false)
443            .with_separator(b'\t')
444            .finish(&mut df)?;
445        Ok(())
446
447
448    }
449
450    pub fn n_mapped(&self) -> u64 {
451        self.n_mapped
452    }
453    pub fn n_unmapped(&self) -> u64 {
454        self.n_unmapped
455    }
456    pub fn n_total_reads(&self) -> u64 {
457        self.n_reads
458    }
459}
460
461pub enum FileType {
462    Bedgraph,
463    Bigwig,
464    TSV,
465}
466
467impl FileType {
468    pub fn from_str(s: &str) -> Result<FileType, String> {
469        match s.to_lowercase().as_str() {
470            "bedgraph" => Ok(FileType::Bedgraph),
471            "bdg" => Ok(FileType::Bedgraph),
472            "bigwig" => Ok(FileType::Bigwig),
473            "bw" => Ok(FileType::Bigwig),
474            "tsv" => Ok(FileType::TSV),
475            "txt" => Ok(FileType::TSV),
476            _ => Err(format!("Unknown file type: {}", s)),
477        }
478    }
479}
480
481
482
483pub fn regions_to_lapper(regions: Vec<Region>) -> Result<HashMap<String, Lapper<usize, u32>>> {
484    let mut lapper: HashMap<String, Lapper<usize, u32>> = HashMap::default();
485    let mut intervals: HashMap<String, Vec<Iv>> = HashMap::default();
486
487    for reg in regions {
488        let chrom = reg.name().to_string();
489        let start = reg.start().map(|x| x.get());
490        let end = reg.end().map(|x| x.get());
491
492        let start = match start {
493            Bound::Included(start) => start,
494            Bound::Excluded(start) => start + 1,
495            _ => 0,
496        };
497
498        let end = match end {
499            Bound::Included(end) => end,
500            Bound::Excluded(end) => end - 1,
501            _ => 0,
502        };
503
504        let iv = Iv {
505            start: start.into(),
506            stop: end.into(),
507            val: 0,
508        };
509        intervals
510            .entry(chrom.clone())
511            .or_insert(Vec::new())
512            .push(iv);
513    }
514
515    for (chrom, ivs) in intervals.iter() {
516        let lap = Lapper::new(ivs.to_vec());
517        lapper.insert(chrom.clone(), lap);
518    }
519
520    Ok(lapper)
521}
522
523
524pub fn bed_to_lapper(bed: PathBuf) -> Result<HashMap<String, Lapper<usize, u32>>> {
525
526    // Read the bed file and convert it to a lapper
527    let reader = std::fs::File::open(bed)?;
528    let mut buf_reader = std::io::BufReader::new(reader);
529    let mut bed_reader = bed::io::Reader::<4, _>::new(buf_reader);
530    let mut record = bed::Record::default();
531    let mut intervals: HashMap<String, Vec<Iv>> = HashMap::default();
532    let mut lapper: HashMap<String, Lapper<usize, u32>> = HashMap::default();
533
534
535    while bed_reader.read_record(&mut record)? != 0 {
536        let chrom = record.reference_sequence_name().to_string();
537        let start = record.feature_start()?;
538        let end = record.feature_end().context("Failed to get feature end")??;
539
540        let iv = Iv {
541            start: start.into(),
542            stop: end.into(),
543            val: 0,
544        };
545        intervals
546            .entry(chrom.clone())
547            .or_insert(Vec::new())
548            .push(iv);
549    }
550
551    for (chrom, ivs) in intervals.iter() {
552        let lap = Lapper::new(ivs.to_vec());
553        lapper.insert(chrom.clone(), lap);
554    }
555    // Check if the lapper is empty
556    if lapper.is_empty() {
557        return Err(anyhow::Error::msg("Lapper is empty"));
558    }
559    
560
561    Ok(lapper)
562}
563
564
565/// Convert the chromosome names in the lapper to the chromosome IDs in the BAM file
566/// This is useful for when we want to use the lapper with the BAM file
567/// We need to convert the chromosome names in the lapper to the chromosome IDs in the BAM file
568pub fn lapper_chrom_name_to_lapper_chrom_id(
569    lapper: HashMap<String, Lapper<usize, u32>>,
570    bam_stats: &BamStats,
571) -> Result<HashMap<usize, Lapper<usize, u32>>> {
572
573    // Convert the chromosome names in the lapper to the chromosome IDs in the BAM file
574    let chrom_id_mapping = bam_stats.chromosome_name_to_id_mapping()?;
575    let mut lapper_chrom_id: HashMap<usize, Lapper<usize, u32>> = HashMap::default();
576    for (chrom, lap) in lapper.iter() {
577        let chrom_id = chrom_id_mapping
578            .get(chrom)
579            .ok_or_else(|| anyhow::Error::msg(format!("Chromosome {} not found in BAM file", chrom)))?;
580        lapper_chrom_id.insert(*chrom_id, lap.clone());
581    }
582    Ok(lapper_chrom_id)
583}
584
585
586
587
588
589/// Get the header of a BAM file
590/// If the noodles crate fails to read the header, we fall back to samtools
591/// This is a bit of a hack but it works for now
592/// The noodles crate is more strict about the header format than samtools
593pub fn get_bam_header<P>(file_path: P) -> Result<sam::Header>
594where
595    P: AsRef<Path>,
596{
597    let file_path = file_path.as_ref();
598
599    // Check that the file exists
600    if !file_path.exists() {
601        return Err(anyhow::Error::from(std::io::Error::new(
602            std::io::ErrorKind::NotFound,
603            format!("File not found: {}", file_path.display()),
604        )));
605    };
606
607    let mut reader = bam::io::indexed_reader::Builder::default()
608        .build_from_path(file_path)
609        .expect("Failed to open file");
610
611    let header = match reader.read_header() {
612        std::result::Result::Ok(header) => header,
613        Err(e) => {
614            debug!(
615                "Failed to read header using noodels falling back to samtools: {}",
616                e
617            );
618
619            let header_samtools = std::process::Command::new("samtools")
620                .arg("view")
621                .arg("-H")
622                .arg(file_path)
623                .output()
624                .expect("Failed to run samtools")
625                .stdout;
626
627            let header_str =
628                String::from_utf8(header_samtools).expect("Failed to convert header to string");
629
630            // Slight hack here for CellRanger BAM files that are missing the version info
631            let header_string =
632                header_str.replace("@HD\tSO:coordinate\n", "@HD\tVN:1.6\tSO:coordinate\n");
633            let header_str = header_string.as_bytes();
634            let mut reader = sam::io::Reader::new(header_str);
635            let header = reader
636                .read_header()
637                .expect("Failed to read header with samtools");
638
639            header
640        }
641    };
642    Ok(header)
643}