use ahash::{HashMap, HashSet};
use anyhow::{Context, Result};
use bio_types::annot::contig::Contig;
use bio_types::strand::ReqStrand;
use log::debug;
use bio_types::strand::Strand as BioStrand;
use noodles::core::{Position, Region};
use noodles::sam::header::record::value::{Map, map::Program, map::program::tag as pg_tag};
use noodles::{bam, bed, sam};
use polars::prelude::*;
use rust_lapper::Interval;
use rust_lapper::Lapper;
use std::ops::Bound;
use std::path::{Path, PathBuf};
use std::str::FromStr;
pub const CB: [u8; 2] = [b'C', b'B'];
pub type Iv = Interval<usize, u32>;
const BAMNADO_PROGRAM_ID: &str = "bamnado";
const BAMNADO_PROGRAM_NAME: &str = "bamnado";
const BAMNADO_VERSION: &str = env!("CARGO_PKG_VERSION");
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum Strand {
Forward,
Reverse,
Both,
}
impl FromStr for Strand {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"forward" => Ok(Strand::Forward),
"reverse" => Ok(Strand::Reverse),
"both" => Ok(Strand::Both),
_ => Err(format!("Invalid strand value: {s}")),
}
}
}
impl std::fmt::Display for Strand {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Strand::Forward => write!(f, "forward"),
Strand::Reverse => write!(f, "reverse"),
Strand::Both => write!(f, "both"),
}
}
}
impl From<Strand> for BioStrand {
fn from(val: Strand) -> Self {
match val {
Strand::Forward => BioStrand::Forward,
Strand::Reverse => BioStrand::Reverse,
Strand::Both => BioStrand::Unknown,
}
}
}
pub fn progress_bar(length: u64, message: String) -> indicatif::ProgressBar {
let progress_style = indicatif::ProgressStyle::with_template(
"{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta}) {msg}",
)
.unwrap()
.progress_chars("##-");
indicatif::ProgressBar::new(length)
.with_style(progress_style)
.with_message(message)
}
pub struct CellBarcodes {
barcodes: HashSet<String>,
}
impl Default for CellBarcodes {
fn default() -> Self {
Self::new()
}
}
impl CellBarcodes {
pub fn new() -> Self {
Self {
barcodes: HashSet::default(),
}
}
pub fn barcodes(&self) -> HashSet<String> {
self.barcodes.clone()
}
pub fn is_empty(&self) -> bool {
self.barcodes.is_empty()
}
pub fn from_csv(file_path: &PathBuf) -> Result<Self> {
let path = Path::new(file_path).to_path_buf();
let df = CsvReadOptions::default()
.with_has_header(true)
.try_into_reader_with_file_path(Some(path))?
.finish()?;
let mut barcodes = HashSet::default();
for barcode in df.column("barcode").unwrap().str().unwrap() {
let barcode = barcode.unwrap().to_string();
barcodes.insert(barcode);
}
println!("Number of barcodes: {}", barcodes.len());
println!(
"First 10 barcodes: {:?}",
barcodes.iter().take(10).collect::<Vec<_>>()
);
Ok(Self { barcodes })
}
}
pub struct CellBarcodesMulti {
barcodes: Vec<HashSet<String>>,
}
impl Default for CellBarcodesMulti {
fn default() -> Self {
Self::new()
}
}
impl CellBarcodesMulti {
pub fn new() -> Self {
Self {
barcodes: Vec::new(),
}
}
pub fn barcodes(&self) -> Vec<HashSet<String>> {
self.barcodes.clone()
}
pub fn is_empty(&self) -> bool {
self.barcodes.is_empty()
}
pub fn from_csv(file_path: &PathBuf) -> Result<Self> {
let path = Path::new(file_path).to_path_buf();
let df = CsvReadOptions::default()
.with_has_header(true)
.try_into_reader_with_file_path(Some(path))?
.finish()?;
let mut barcodes = Vec::new();
for column in df.iter() {
let barcode = column.str().unwrap();
let bc = barcode
.iter()
.filter_map(|x| x.map(|x| x.to_string()))
.collect::<HashSet<String>>();
barcodes.push(bc);
}
Ok(Self { barcodes })
}
}
#[derive(Debug)]
#[allow(dead_code)]
struct ChromosomeStats {
chrom: String,
length: u64,
mapped: u64,
unmapped: u64,
}
pub fn bam_header(file_path: PathBuf) -> Result<sam::Header> {
if !file_path.exists() {
return Err(anyhow::Error::from(std::io::Error::new(
std::io::ErrorKind::NotFound,
format!("File not found: {}", file_path.display()),
)));
};
let mut reader = bam::io::indexed_reader::Builder::default()
.build_from_path(file_path.clone())
.expect("Failed to open file");
let header = match reader.read_header() {
std::result::Result::Ok(header) => header,
Err(e) => {
debug!("Failed to read header using noodels falling back to samtools: {e}");
let header_samtools = std::process::Command::new("samtools")
.arg("view")
.arg("-H")
.arg(file_path.clone())
.output()
.expect("Failed to run samtools")
.stdout;
let header_str =
String::from_utf8(header_samtools).expect("Failed to convert header to string");
let header_string =
header_str.replace("@HD\tSO:coordinate\n", "@HD\tVN:1.6\tSO:coordinate\n");
let header_str = header_string.as_bytes();
let mut reader = sam::io::Reader::new(header_str);
reader
.read_header()
.expect("Failed to read header with samtools")
}
};
Ok(header)
}
pub struct BamStats {
#[allow(dead_code)]
file_path: PathBuf,
header: sam::Header,
#[allow(dead_code)]
contigs: Vec<Contig<String, ReqStrand>>,
chrom_stats: HashMap<String, ChromosomeStats>,
n_reads: u64,
n_mapped: u64,
n_unmapped: u64,
}
impl BamStats {
pub fn new(file_path: PathBuf) -> Result<Self> {
let header = bam_header(file_path.clone())?;
let contigs = header
.reference_sequences()
.iter()
.map(|(name, map)| {
let name = name.to_string();
let length = map.length().get();
Contig::new(name, 1, length, ReqStrand::Forward)
})
.collect();
let bam_reader = bam::io::indexed_reader::Builder::default()
.build_from_path(file_path.clone())
.expect("Failed to open file");
let index = bam_reader.index();
let mut chrom_stats = HashMap::default();
for ((reference_sequence_name_buf, reference_sequence), index_reference_sequence) in header
.reference_sequences()
.iter()
.zip(index.reference_sequences())
{
let reference_sequence_name = std::str::from_utf8(reference_sequence_name_buf)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))?;
let (mapped_record_count, unmapped_record_count) = index_reference_sequence
.metadata()
.map(|m| (m.mapped_record_count(), m.unmapped_record_count()))
.unwrap_or_default();
let stats = ChromosomeStats {
chrom: reference_sequence_name.to_string(),
length: usize::from(reference_sequence.length()) as u64,
mapped: mapped_record_count,
unmapped: unmapped_record_count,
};
chrom_stats.insert(reference_sequence_name.to_string(), stats);
}
let mut unmapped_record_count = index.unplaced_unmapped_record_count().unwrap_or_default();
let mut mapped_record_count = 0;
for (_, stats) in chrom_stats.iter() {
mapped_record_count += stats.mapped;
unmapped_record_count += stats.unmapped;
}
let n_reads = mapped_record_count + unmapped_record_count;
Ok(Self {
file_path,
header,
contigs,
chrom_stats,
n_reads,
n_mapped: mapped_record_count,
n_unmapped: unmapped_record_count,
})
}
pub fn estimate_genome_chunk_length(&self, bin_size: u64) -> Result<u64> {
let stats = &self.chrom_stats;
let genome_length = stats.values().map(|x| x.length).sum::<u64>();
let max_reads_per_bp = self.n_mapped as f64 / genome_length as f64;
let genome_chunk_length = 5e6_f64.min(2e6 / (max_reads_per_bp));
let correction = genome_chunk_length % bin_size as f64;
let genome_chunk_length = genome_chunk_length - correction;
Ok(genome_chunk_length as u64)
}
pub fn genome_chunks(&self, bin_size: u64) -> Result<Vec<Region>> {
let genome_chunk_length = self.estimate_genome_chunk_length(bin_size)?;
let chrom_chunks = self
.chrom_stats
.iter()
.flat_map(|(chrom, stats)| {
let mut chunks = Vec::new();
let mut chunk_start = 1;
let chrom_end = stats.length;
while chunk_start <= chrom_end {
let chunk_end = chunk_start + genome_chunk_length - 1; let chunk_end = chunk_end.min(chrom_end);
let start = Position::try_from(chunk_start as usize).unwrap();
let end = Position::try_from(chunk_end as usize).unwrap();
let region = Region::new(&*chrom.clone(), start..=end);
chunks.push(region);
chunk_start = chunk_end + 1; }
chunks
})
.collect();
Ok(chrom_chunks)
}
pub fn chromosome_chunks(&self, chrom: &str, bin_size: u64) -> Result<Vec<Region>> {
let genome_chunk_length = self.estimate_genome_chunk_length(bin_size)?;
let stats = self
.chrom_stats
.get(chrom)
.ok_or_else(|| anyhow::Error::msg(format!("Chromosome {chrom} not found")))?;
let mut chunks = Vec::new();
let mut chunk_start = 1;
let chrom_end = stats.length;
while chunk_start <= chrom_end {
let chunk_end = chunk_start + genome_chunk_length - 1; let chunk_end = chunk_end.min(chrom_end);
let start = Position::try_from(chunk_start as usize).unwrap();
let end = Position::try_from(chunk_end as usize).unwrap();
let region = Region::new(chrom, start..=end);
chunks.push(region);
chunk_start = chunk_end + 1; }
Ok(chunks)
}
pub fn chromosome_id_to_chromosome_name_mapping(&self) -> HashMap<usize, String> {
let mut ref_id_mapping = HashMap::default();
for (i, (name, _)) in self.header.reference_sequences().iter().enumerate() {
let name = std::str::from_utf8(name)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))
.unwrap()
.to_string();
ref_id_mapping.insert(i, name);
}
ref_id_mapping
}
pub fn chromosome_name_to_id_mapping(&self) -> Result<HashMap<String, usize>> {
let mut mapping = HashMap::default();
let id_to_name = self.chromosome_id_to_chromosome_name_mapping();
for (id, name) in id_to_name.iter() {
mapping.insert(name.clone(), *id);
}
Ok(mapping)
}
pub fn chromsizes_ref_id(&self) -> Result<HashMap<usize, u64>> {
let mut chromsizes = HashMap::default();
for (i, (_, map)) in self.header.reference_sequences().iter().enumerate() {
let length = map.length().get();
chromsizes.insert(i, length as u64);
}
Ok(chromsizes)
}
pub fn chromsizes_ref_name(&self) -> Result<HashMap<String, u64>> {
let mut chromsizes = HashMap::default();
for (name, map) in self.header.reference_sequences() {
let length = map.length().get();
let name = std::str::from_utf8(name)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))
.unwrap()
.to_string();
chromsizes.insert(name, length as u64);
}
Ok(chromsizes)
}
pub fn n_mapped(&self) -> u64 {
self.n_mapped
}
pub fn n_unmapped(&self) -> u64 {
self.n_unmapped
}
pub fn n_total_reads(&self) -> u64 {
self.n_reads
}
pub fn header(&self) -> &sam::Header {
&self.header
}
pub fn is_paired_end(&self) -> Result<bool> {
let mut reader = bam::io::reader::Builder
.build_from_path(&self.file_path)
.context("Failed to open BAM file to check paired-end status")?;
let _header = reader.read_header().context("Failed to read BAM header")?;
for (i, record) in reader.records().enumerate() {
let record = match record {
Ok(r) => r,
Err(_) => break,
};
if record.flags().is_segmented() {
return Ok(true);
}
if i >= 1000 {
break;
}
}
Ok(false)
}
}
pub enum FileType {
Bedgraph,
Bigwig,
TSV,
}
impl std::str::FromStr for FileType {
type Err = String;
fn from_str(s: &str) -> Result<FileType, String> {
match s.to_lowercase().as_str() {
"bedgraph" => Ok(FileType::Bedgraph),
"bdg" => Ok(FileType::Bedgraph),
"bigwig" => Ok(FileType::Bigwig),
"bw" => Ok(FileType::Bigwig),
"tsv" => Ok(FileType::TSV),
"txt" => Ok(FileType::TSV),
_ => Err(format!("Unknown file type: {s}")),
}
}
}
pub fn regions_to_lapper(regions: Vec<Region>) -> Result<HashMap<String, Lapper<usize, u32>>> {
let mut lapper: HashMap<String, Lapper<usize, u32>> = HashMap::default();
let mut intervals: HashMap<String, Vec<Iv>> = HashMap::default();
for reg in regions {
let chrom = reg.name().to_string();
let start = reg.start().map(|x| x.get());
let end = reg.end().map(|x| x.get());
let start = match start {
Bound::Included(start) => start,
Bound::Excluded(start) => start + 1,
_ => 0,
};
let end = match end {
Bound::Included(end) => end,
Bound::Excluded(end) => end - 1,
_ => 0,
};
let iv = Iv {
start,
stop: end,
val: 0,
};
intervals.entry(chrom.clone()).or_default().push(iv);
}
for (chrom, ivs) in intervals.iter() {
let lap = Lapper::new(ivs.to_vec());
lapper.insert(chrom.clone(), lap);
}
Ok(lapper)
}
pub fn bed_to_lapper(bed: PathBuf) -> Result<HashMap<String, Lapper<usize, u32>>> {
let reader = std::fs::File::open(bed)?;
let buf_reader = std::io::BufReader::new(reader);
let mut bed_reader = bed::io::Reader::<4, _>::new(buf_reader);
let mut record = bed::Record::default();
let mut intervals: HashMap<String, Vec<Iv>> = HashMap::default();
let mut lapper: HashMap<String, Lapper<usize, u32>> = HashMap::default();
while bed_reader.read_record(&mut record)? != 0 {
let chrom = record.reference_sequence_name().to_string();
let start = record.feature_start()?;
let end = record
.feature_end()
.context("Failed to get feature end")??;
let iv = Iv {
start: start.get(),
stop: end.get(),
val: 0,
};
intervals.entry(chrom.clone()).or_default().push(iv);
}
for (chrom, ivs) in intervals.iter() {
let lap = Lapper::new(ivs.to_vec());
lapper.insert(chrom.clone(), lap);
}
if lapper.is_empty() {
return Err(anyhow::Error::msg("Lapper is empty"));
}
Ok(lapper)
}
pub fn convert_lapper_chrom_names_to_ids(
lapper: HashMap<String, Lapper<usize, u32>>,
bam_stats: &BamStats,
) -> Result<HashMap<usize, Lapper<usize, u32>>> {
let chrom_id_mapping = bam_stats.chromosome_name_to_id_mapping()?;
let mut lapper_chrom_id: HashMap<usize, Lapper<usize, u32>> = HashMap::default();
for (chrom, lap) in lapper.iter() {
let chrom_id = chrom_id_mapping.get(chrom).ok_or_else(|| {
anyhow::Error::msg(format!("Chromosome {chrom} not found in BAM file"))
})?;
lapper_chrom_id.insert(*chrom_id, lap.clone());
}
Ok(lapper_chrom_id)
}
pub fn get_bam_header<P>(file_path: P) -> Result<sam::Header>
where
P: AsRef<Path>,
{
let file_path = file_path.as_ref();
if !file_path.exists() {
return Err(anyhow::Error::from(std::io::Error::new(
std::io::ErrorKind::NotFound,
format!("File not found: {}", file_path.display()),
)));
};
let mut reader = bam::io::indexed_reader::Builder::default()
.build_from_path(file_path)
.expect("Failed to open file");
let header = match reader.read_header() {
std::result::Result::Ok(header) => header,
Err(e) => {
debug!("Failed to read header using noodels falling back to samtools: {e}");
let header_samtools = std::process::Command::new("samtools")
.arg("view")
.arg("-H")
.arg(file_path)
.output()
.expect("Failed to run samtools")
.stdout;
let header_str =
String::from_utf8(header_samtools).expect("Failed to convert header to string");
let header_string =
header_str.replace("@HD\tSO:coordinate\n", "@HD\tVN:1.6\tSO:coordinate\n");
let header_str = header_string.as_bytes();
let mut reader = sam::io::Reader::new(header_str);
reader
.read_header()
.expect("Failed to read header with samtools")
}
};
Ok(header)
}
pub fn add_bamnado_program_group(header: &sam::Header) -> Result<sam::Header> {
let mut output_header = header.clone();
let mut program = Map::<Program>::default();
program
.other_fields_mut()
.insert(pg_tag::NAME, BAMNADO_PROGRAM_NAME.into());
program
.other_fields_mut()
.insert(pg_tag::VERSION, BAMNADO_VERSION.into());
if let Some(command_line) = bamnado_command_line() {
program
.other_fields_mut()
.insert(pg_tag::COMMAND_LINE, command_line.into());
}
output_header
.programs_mut()
.add(BAMNADO_PROGRAM_ID, program)
.context("Failed to append bamnado @PG header record")?;
Ok(output_header)
}
fn bamnado_command_line() -> Option<String> {
let args = std::env::args_os()
.map(|arg| arg.to_string_lossy().into_owned())
.collect::<Vec<_>>();
(!args.is_empty()).then(|| args.join(" "))
}
#[cfg(test)]
mod tests {
use super::*;
use noodles::sam::header::record::value::map::program::tag as pg_tag;
fn test_bam() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.expect("Failed to get workspace root")
.join("test/data/test.bam")
}
#[test]
fn test_add_bamnado_program_group_appends_pg_record() {
let header = sam::Header::default();
let output_header =
add_bamnado_program_group(&header).expect("Failed to add bamnado program group");
let program = output_header
.programs()
.as_ref()
.get(&b"bamnado"[..])
.expect("Missing bamnado @PG record");
assert_eq!(
program
.other_fields()
.get(&pg_tag::NAME)
.map(|v| v.as_ref()),
Some(&b"bamnado"[..])
);
assert_eq!(
program
.other_fields()
.get(&pg_tag::VERSION)
.map(|v| v.as_ref()),
Some(env!("CARGO_PKG_VERSION").as_bytes())
);
assert!(program.other_fields().contains_key(&pg_tag::COMMAND_LINE));
}
#[test]
fn test_is_paired_end_returns_true_for_paired_bam() {
let stats = BamStats::new(test_bam()).expect("Failed to create BamStats");
assert!(
stats.is_paired_end().expect("is_paired_end failed"),
"test.bam should be detected as paired-end"
);
}
#[test]
fn test_bam_stats_reads_are_nonzero() {
let stats = BamStats::new(test_bam()).expect("Failed to create BamStats");
assert!(stats.n_mapped() > 0, "Expected mapped reads in test.bam");
}
}