1use ahash::{HashMap, HashSet};
2use anyhow::{Context, Result};
3use bio_types::annot::contig::Contig;
4use bio_types::annot::loc::Loc;
5use bio_types::strand::ReqStrand;
6use log::{debug, info, warn};
7
8use noodles::core::{Position, Region};
9use noodles::{bam, sam, bed};
10use noodles::sam::alignment::record::data::field::tag::Tag;
11use polars::prelude::*;
12use rust_lapper::Interval;
13use rust_lapper::Lapper;
14use std::io::{BufRead, Write};
15use std::ops::Bound;
16use std::path::{Path, PathBuf};
17use regex::Regex;
18use std::str::FromStr;
19use bio_types::strand::Strand as BioStrand;
20
21pub const CB: [u8; 2] = [b'C', b'B'];
22pub type Iv = Interval<usize, u32>;
23
24
25
26#[derive(Debug, Clone, PartialEq, Eq, Copy)]
27pub enum Strand {
28 Forward,
29 Reverse,
30 Both,
31}
32impl FromStr for Strand {
33 type Err = String;
34
35 fn from_str(s: &str) -> Result<Self, Self::Err> {
36 match s.to_lowercase().as_str() {
37 "forward" => Ok(Strand::Forward),
38 "reverse" => Ok(Strand::Reverse),
39 "both" => Ok(Strand::Both),
40 _ => Err(format!("Invalid strand value: {}", s)),
41 }
42 }
43}
44impl std::fmt::Display for Strand {
45 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
46 match self {
47 Strand::Forward => write!(f, "forward"),
48 Strand::Reverse => write!(f, "reverse"),
49 Strand::Both => write!(f, "both"),
50 }
51 }
52}
53
54impl Into<BioStrand> for Strand {
55 fn into(self) -> BioStrand {
56 match self {
57 Strand::Forward => BioStrand::Forward,
58 Strand::Reverse => BioStrand::Reverse,
59 Strand::Both => BioStrand::Unknown,
60 }
61 }
62}
63
64
65
66
67pub fn progress_bar(length: u64, message: String) -> indicatif::ProgressBar {
68 let progress_style = indicatif::ProgressStyle::with_template(
70 "{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta}) {msg}",
71 )
72 .unwrap()
73 .progress_chars("##-");
74
75 let progress_bar = indicatif::ProgressBar::new(length as u64);
76 progress_bar.set_style(progress_style.clone());
77 progress_bar.set_message(message);
78 progress_bar
79}
80
81pub struct CellBarcodes {
82 barcodes: HashSet<String>,
83}
84
85impl CellBarcodes {
86 pub fn new() -> Self {
87 Self {
88 barcodes: HashSet::default(),
89 }
90 }
91
92 pub fn barcodes(&self) -> HashSet<String> {
93 self.barcodes.clone()
94 }
95
96 pub fn is_empty(&self) -> bool {
97 self.barcodes.is_empty()
98 }
99
100 pub fn from_csv(file_path: &PathBuf) -> Result<Self> {
101 let path = Path::new(file_path).to_path_buf();
102
103 let df = CsvReadOptions::default()
104 .with_has_header(true)
105 .try_into_reader_with_file_path(Some(path))?
106 .finish()?;
107 let mut barcodes = HashSet::default();
108
109 for barcode in df.column("barcode").unwrap().str().unwrap() {
110 let barcode = barcode.unwrap().to_string();
111 barcodes.insert(barcode);
112 }
113
114 println!("Number of barcodes: {}", barcodes.len());
115 println!(
116 "First 10 barcodes: {:?}",
117 barcodes.iter().take(10).collect::<Vec<&String>>()
118 );
119
120 Ok(Self { barcodes })
121 }
122}
123
124pub struct CellBarcodesMulti {
125 barcodes: Vec<HashSet<String>>,
126}
127
128impl CellBarcodesMulti {
129 pub fn new() -> Self {
130 Self {
131 barcodes: Vec::new(),
132 }
133 }
134
135 pub fn barcodes(&self) -> Vec<HashSet<String>> {
136 self.barcodes.clone()
137 }
138
139 pub fn is_empty(&self) -> bool {
140 self.barcodes.is_empty()
141 }
142
143 pub fn from_csv(file_path: &PathBuf) -> Result<Self> {
144 let path = Path::new(file_path).to_path_buf();
145
146 let df = CsvReadOptions::default()
147 .with_has_header(true)
148 .try_into_reader_with_file_path(Some(path))?
149 .finish()?;
150
151 let mut barcodes = Vec::new();
152
153 for (i, column) in df.iter().enumerate() {
154 let barcode = column.str().unwrap();
155
156 let bc = barcode
157 .iter()
158 .filter_map(|x| x.map(|x| x.to_string()))
159 .collect::<HashSet<String>>();
160
161 barcodes.push(bc);
162 }
163
164 Ok(Self { barcodes })
165 }
166}
167
168#[derive(Debug)]
169struct ChromosomeStats {
170 chrom: String,
171 length: u64,
172 mapped: u64,
173 unmapped: u64,
174}
175
176
177pub fn bam_header(file_path: PathBuf) -> Result<sam::Header> {
178 if !file_path.exists() {
185 return Err(anyhow::Error::from(std::io::Error::new(
186 std::io::ErrorKind::NotFound,
187 format!("File not found: {}", file_path.display()),
188 )));
189 };
190
191 let mut reader = bam::io::indexed_reader::Builder::default()
192 .build_from_path(file_path.clone())
193 .expect("Failed to open file");
194
195 let header = match reader.read_header() {
196 std::result::Result::Ok(header) => header,
197 Err(e) => {
198 debug!(
199 "Failed to read header using noodels falling back to samtools: {}",
200 e
201 );
202
203 let header_samtools = std::process::Command::new("samtools")
204 .arg("view")
205 .arg("-H")
206 .arg(file_path.clone())
207 .output()
208 .expect("Failed to run samtools")
209 .stdout;
210
211 let header_str =
212 String::from_utf8(header_samtools).expect("Failed to convert header to string");
213
214 let header_string =
216 header_str.replace("@HD\tSO:coordinate\n", "@HD\tVN:1.6\tSO:coordinate\n");
217 let header_str = header_string.as_bytes();
218 let mut reader = sam::io::Reader::new(header_str);
219 let header = reader
220 .read_header()
221 .expect("Failed to read header with samtools");
222 header
223 }
224 };
225 Ok(header)
226}
227
228pub struct BamStats {
229 file_path: PathBuf,
233
234 header: sam::Header,
236
237 contigs: Vec<Contig<String, ReqStrand>>,
239
240 chrom_stats: HashMap<String, ChromosomeStats>,
242
243 n_reads: u64,
245 n_mapped: u64,
247 n_unmapped: u64,
249}
250
251impl BamStats {
252 pub fn new(file_path: PathBuf) -> Result<Self> {
253 let header = bam_header(file_path.clone())?;
255
256 let contigs = header
258 .reference_sequences()
259 .iter()
260 .map(|(name, map)| {
261 let name = name.to_string();
262 let length = map.length().get();
263 let contig = Contig::new(name, 1, length, ReqStrand::Forward);
264 contig
265 })
266 .collect();
267
268 let bam_reader = bam::io::indexed_reader::Builder::default()
270 .build_from_path(file_path.clone())
271 .expect("Failed to open file");
272 let index = bam_reader.index();
273
274 let mut chrom_stats = HashMap::default();
276
277 for ((reference_sequence_name_buf, reference_sequence), index_reference_sequence) in header
278 .reference_sequences()
279 .iter()
280 .zip(index.reference_sequences())
281 {
282 let reference_sequence_name = std::str::from_utf8(reference_sequence_name_buf)
283 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))?;
284
285 let (mapped_record_count, unmapped_record_count) = index_reference_sequence
286 .metadata()
287 .map(|m| (m.mapped_record_count(), m.unmapped_record_count()))
288 .unwrap_or_default();
289
290 let stats = ChromosomeStats {
291 chrom: reference_sequence_name.to_string(),
292 length: usize::from(reference_sequence.length()) as u64,
293 mapped: mapped_record_count,
294 unmapped: unmapped_record_count,
295 };
296
297 chrom_stats.insert(reference_sequence_name.to_string(), stats);
298 }
299
300 let mut unmapped_record_count = index.unplaced_unmapped_record_count().unwrap_or_default();
301
302 let mut mapped_record_count = 0;
303
304 for (_, stats) in chrom_stats.iter() {
305 mapped_record_count += stats.mapped;
306 unmapped_record_count += stats.unmapped;
307 }
308
309 let n_reads = mapped_record_count + unmapped_record_count;
310
311 Ok(Self {
312 file_path,
313 header,
314 contigs,
315 chrom_stats,
316 n_reads,
317 n_mapped: mapped_record_count,
318 n_unmapped: unmapped_record_count,
319 })
320 }
321
322 pub fn estimate_genome_chunk_length(&self, bin_size: u64) -> Result<u64> {
323 let stats = &self.chrom_stats;
329 let genome_length = stats.values().map(|x| x.length).sum::<u64>();
330 let max_reads_per_bp = self.n_mapped as f64 / genome_length as f64;
331 let genome_chunk_length = f64::from(5e6).min(2e6 / (max_reads_per_bp));
332
333 let correction = genome_chunk_length % bin_size as f64;
334 let genome_chunk_length = genome_chunk_length - correction;
335
336 Ok(genome_chunk_length as u64)
337 }
338
339 pub fn genome_chunks(&self, bin_size: u64) -> Result<Vec<Region>> {
340 let genome_chunk_length = self.estimate_genome_chunk_length(bin_size)?;
341 let chrom_chunks = self
342 .chrom_stats
343 .iter()
344 .map(|(chrom, stats)| {
345 let mut chunks = Vec::new();
346 let mut chunk_start = 1;
347 let chrom_end = stats.length;
348
349 while chunk_start <= chrom_end {
350 let chunk_end = chunk_start + genome_chunk_length - 1; let chunk_end = chunk_end.min(chrom_end); let start = Position::try_from(chunk_start as usize).unwrap();
355 let end = Position::try_from(chunk_end as usize).unwrap();
356
357 let region = Region::new(&*chrom.clone(), start..=end);
358 chunks.push(region);
359 chunk_start = chunk_end + 1; }
361 chunks
362 })
363 .flatten()
364 .collect();
365
366 Ok(chrom_chunks)
367 }
368
369 pub fn chromosome_id_to_chromosome_name_mapping(&self) -> HashMap<usize, String> {
370 let mut ref_id_mapping = HashMap::default();
371 for (i, (name, _)) in self.header.reference_sequences().iter().enumerate() {
372 let name = std::str::from_utf8(name)
373 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))
374 .unwrap()
375 .to_string();
376 ref_id_mapping.insert(i, name);
377 }
378 ref_id_mapping
379 }
380
381 pub fn chromosome_name_to_id_mapping(&self) -> Result<HashMap<String, usize>> {
382 let mut mapping = HashMap::default();
383 let id_to_name = self.chromosome_id_to_chromosome_name_mapping();
384
385 for (id, name) in id_to_name.iter() {
387 mapping.insert(name.clone(), *id);
388 }
389
390 Ok(mapping)
391 }
392
393 pub fn chromsizes_ref_id(&self) -> Result<HashMap<usize, u64>> {
394 let mut chromsizes = HashMap::default();
395 for (i, (_, map)) in self.header.reference_sequences().iter().enumerate() {
396 let length = map.length().get();
397 chromsizes.insert(i, length as u64);
398 }
399 Ok(chromsizes)
400 }
401
402 pub fn chromsizes_ref_name(&self) -> Result<HashMap<String, u64>> {
403 let mut chromsizes = HashMap::default();
404 for (name, map) in self.header.reference_sequences() {
405 let length = map.length().get();
406 let name = std::str::from_utf8(name)
407 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))
408 .unwrap()
409 .to_string();
410 chromsizes.insert(name, length as u64);
411 }
412 Ok(chromsizes)
413 }
414
415 pub fn write_chromsizes(&self, outfile: PathBuf) -> Result<()> {
416 let chrom_sizes = self.chromsizes_ref_name()?;
429 let mut df = DataFrame::new(vec![
431 Column::new("chrom".into(), chrom_sizes.keys().cloned().collect::<Vec<String>>()),
432 Column::new("length".into(), chrom_sizes.values().cloned().collect::<Vec<u64>>()),
433 ])?;
434
435 df.sort_in_place(["chrom"], SortMultipleOptions::default())?;
437
438 info!("Writing chromosome sizes to {}", outfile.display());
440 let mut file = std::fs::File::create(outfile)?;
441 CsvWriter::new(&mut file)
442 .include_header(false)
443 .with_separator(b'\t')
444 .finish(&mut df)?;
445 Ok(())
446
447
448 }
449
450 pub fn n_mapped(&self) -> u64 {
451 self.n_mapped
452 }
453 pub fn n_unmapped(&self) -> u64 {
454 self.n_unmapped
455 }
456 pub fn n_total_reads(&self) -> u64 {
457 self.n_reads
458 }
459}
460
461pub enum FileType {
462 Bedgraph,
463 Bigwig,
464 TSV,
465}
466
467impl FileType {
468 pub fn from_str(s: &str) -> Result<FileType, String> {
469 match s.to_lowercase().as_str() {
470 "bedgraph" => Ok(FileType::Bedgraph),
471 "bdg" => Ok(FileType::Bedgraph),
472 "bigwig" => Ok(FileType::Bigwig),
473 "bw" => Ok(FileType::Bigwig),
474 "tsv" => Ok(FileType::TSV),
475 "txt" => Ok(FileType::TSV),
476 _ => Err(format!("Unknown file type: {}", s)),
477 }
478 }
479}
480
481
482
483pub fn regions_to_lapper(regions: Vec<Region>) -> Result<HashMap<String, Lapper<usize, u32>>> {
484 let mut lapper: HashMap<String, Lapper<usize, u32>> = HashMap::default();
485 let mut intervals: HashMap<String, Vec<Iv>> = HashMap::default();
486
487 for reg in regions {
488 let chrom = reg.name().to_string();
489 let start = reg.start().map(|x| x.get());
490 let end = reg.end().map(|x| x.get());
491
492 let start = match start {
493 Bound::Included(start) => start,
494 Bound::Excluded(start) => start + 1,
495 _ => 0,
496 };
497
498 let end = match end {
499 Bound::Included(end) => end,
500 Bound::Excluded(end) => end - 1,
501 _ => 0,
502 };
503
504 let iv = Iv {
505 start: start.into(),
506 stop: end.into(),
507 val: 0,
508 };
509 intervals
510 .entry(chrom.clone())
511 .or_insert(Vec::new())
512 .push(iv);
513 }
514
515 for (chrom, ivs) in intervals.iter() {
516 let lap = Lapper::new(ivs.to_vec());
517 lapper.insert(chrom.clone(), lap);
518 }
519
520 Ok(lapper)
521}
522
523
524pub fn bed_to_lapper(bed: PathBuf) -> Result<HashMap<String, Lapper<usize, u32>>> {
525
526 let reader = std::fs::File::open(bed)?;
528 let mut buf_reader = std::io::BufReader::new(reader);
529 let mut bed_reader = bed::io::Reader::<4, _>::new(buf_reader);
530 let mut record = bed::Record::default();
531 let mut intervals: HashMap<String, Vec<Iv>> = HashMap::default();
532 let mut lapper: HashMap<String, Lapper<usize, u32>> = HashMap::default();
533
534
535 while bed_reader.read_record(&mut record)? != 0 {
536 let chrom = record.reference_sequence_name().to_string();
537 let start = record.feature_start()?;
538 let end = record.feature_end().context("Failed to get feature end")??;
539
540 let iv = Iv {
541 start: start.into(),
542 stop: end.into(),
543 val: 0,
544 };
545 intervals
546 .entry(chrom.clone())
547 .or_insert(Vec::new())
548 .push(iv);
549 }
550
551 for (chrom, ivs) in intervals.iter() {
552 let lap = Lapper::new(ivs.to_vec());
553 lapper.insert(chrom.clone(), lap);
554 }
555 if lapper.is_empty() {
557 return Err(anyhow::Error::msg("Lapper is empty"));
558 }
559
560
561 Ok(lapper)
562}
563
564
565pub fn lapper_chrom_name_to_lapper_chrom_id(
569 lapper: HashMap<String, Lapper<usize, u32>>,
570 bam_stats: &BamStats,
571) -> Result<HashMap<usize, Lapper<usize, u32>>> {
572
573 let chrom_id_mapping = bam_stats.chromosome_name_to_id_mapping()?;
575 let mut lapper_chrom_id: HashMap<usize, Lapper<usize, u32>> = HashMap::default();
576 for (chrom, lap) in lapper.iter() {
577 let chrom_id = chrom_id_mapping
578 .get(chrom)
579 .ok_or_else(|| anyhow::Error::msg(format!("Chromosome {} not found in BAM file", chrom)))?;
580 lapper_chrom_id.insert(*chrom_id, lap.clone());
581 }
582 Ok(lapper_chrom_id)
583}
584
585
586
587
588
589pub fn get_bam_header<P>(file_path: P) -> Result<sam::Header>
594where
595 P: AsRef<Path>,
596{
597 let file_path = file_path.as_ref();
598
599 if !file_path.exists() {
601 return Err(anyhow::Error::from(std::io::Error::new(
602 std::io::ErrorKind::NotFound,
603 format!("File not found: {}", file_path.display()),
604 )));
605 };
606
607 let mut reader = bam::io::indexed_reader::Builder::default()
608 .build_from_path(file_path)
609 .expect("Failed to open file");
610
611 let header = match reader.read_header() {
612 std::result::Result::Ok(header) => header,
613 Err(e) => {
614 debug!(
615 "Failed to read header using noodels falling back to samtools: {}",
616 e
617 );
618
619 let header_samtools = std::process::Command::new("samtools")
620 .arg("view")
621 .arg("-H")
622 .arg(file_path)
623 .output()
624 .expect("Failed to run samtools")
625 .stdout;
626
627 let header_str =
628 String::from_utf8(header_samtools).expect("Failed to convert header to string");
629
630 let header_string =
632 header_str.replace("@HD\tSO:coordinate\n", "@HD\tVN:1.6\tSO:coordinate\n");
633 let header_str = header_string.as_bytes();
634 let mut reader = sam::io::Reader::new(header_str);
635 let header = reader
636 .read_header()
637 .expect("Failed to read header with samtools");
638
639 header
640 }
641 };
642 Ok(header)
643}