use anyhow::Context;
use indicatif::{ProgressBar, ProgressStyle};
use slog::{crit, info};
use std::fs;
use std::fs::File;
use std::io::{stdout, BufReader, BufWriter, Cursor, Seek, SeekFrom, Write};
use libradicl::rad_types;
use libradicl::utils::MASK_LOWER_31_U32;
use needletail::bitkmer::*;
use rand::Rng;
use rust_htslib::bam::HeaderView;
use rust_htslib::{bam, bam::record::Aux, bam::Read};
use std::collections::HashMap;
use std::error::Error;
use std::path::Path;
use std::str;
#[allow(dead_code)]
fn get_random_nucl() -> &'static str {
let nucl = vec!["A", "T", "G", "C"];
let mut rng = rand::thread_rng();
let idx = rng.gen_range(0..4);
nucl[idx]
}
#[allow(dead_code)]
pub fn cb_string_to_u64(cb_str: &[u8]) -> Result<u64, Box<dyn Error>> {
let mut cb_id: u64 = 0;
for (idx, nt) in cb_str.iter().rev().enumerate() {
let offset = idx * 2;
match nt {
65 | 78 => (), 67 => cb_id |= 1 << offset, 71 => cb_id |= 2 << offset, 84 => cb_id |= 3 << offset, _ => panic!("unknown nucleotide {}", nt),
};
}
Ok(cb_id)
}
#[allow(dead_code)]
pub fn tid_2_contig(h: &HeaderView) -> HashMap<u32, String> {
let mut dict: HashMap<u32, String> = HashMap::with_capacity(46);
for (i, t) in h
.target_names()
.iter()
.map(|a| str::from_utf8(a).unwrap())
.enumerate()
{
dict.insert(i as u32, t.to_owned());
}
dict
}
pub fn bam2rad<P1, P2>(input_file: P1, rad_file: P2, num_threads: u32, log: &slog::Logger)
where
P1: AsRef<Path>,
P2: AsRef<Path>,
{
let oname = Path::new(rad_file.as_ref());
let parent = oname.parent().unwrap();
std::fs::create_dir_all(parent).unwrap();
if oname.exists() {
std::fs::remove_file(oname).expect("could not be deleted");
}
let ofile = File::create(rad_file.as_ref()).unwrap();
let mut bam = bam::Reader::from_path(&input_file).unwrap();
let bam_bytes = fs::metadata(&input_file).unwrap().len();
info! {
log,
"Bam file size in bytes {:?}",
bam_bytes
};
if num_threads > 1 {
bam.set_threads((num_threads as usize) - 1).unwrap();
} else {
bam.set_threads(1).unwrap();
}
let hdrv = bam.header().to_owned();
let mut data = Cursor::new(vec![]);
let mut owriter = BufWriter::with_capacity(1048576, ofile);
{
let is_paired = 0u8;
data.write_all(&is_paired.to_le_bytes())
.expect("couldn't write to output file");
let ref_count = hdrv.target_count() as u64;
data.write_all(&ref_count.to_le_bytes())
.expect("couldn't write to output file");
for t in hdrv.target_names().iter() {
let name_size = t.len() as u16;
data.write_all(&name_size.to_le_bytes())
.expect("coudn't write to output file");
data.write_all(t).expect("coudn't write to output file");
}
let initial_num_chunks = 0u64;
data.write_all(&initial_num_chunks.to_le_bytes())
.expect("coudn't write to output file");
}
{
info!(log, "ref count: {:?} ", hdrv.target_count(),);
}
let end_header_pos =
data.seek(SeekFrom::Current(0)).unwrap() - std::mem::size_of::<u64>() as u64;
info!(log, "end header pos: {:?}", end_header_pos,);
let mut rec = bam::Record::new();
let first_record_exists = bam.read(&mut rec).is_some();
if !first_record_exists {
crit!(log, "bam file had no records!");
std::process::exit(1);
}
{
let mut num_tags = 2u16;
data.write_all(&num_tags.to_le_bytes())
.expect("coudn't write to output file");
let mut typeid = 2u8;
let mut cb_tag_str = "cblen";
let mut umi_tag_str = "ulen";
rad_types::write_str_bin(cb_tag_str, &rad_types::RadIntId::U16, &mut data);
data.write_all(&typeid.to_le_bytes())
.expect("coudn't write to output file");
rad_types::write_str_bin(umi_tag_str, &rad_types::RadIntId::U16, &mut data);
data.write_all(&typeid.to_le_bytes())
.expect("coudn't write to output file");
let bc_string_in: &str = if let Ok(Aux::String(bcs)) = rec.aux(b"CR") {
bcs
} else {
panic!("Input record missing CR tag!")
};
let umi_string_in: &str = if let Ok(Aux::String(umis)) = rec.aux(b"UR") {
umis
} else {
panic!("Input record missing UR tag!")
};
let bclen = bc_string_in.len() as u16;
let umilen = umi_string_in.len() as u16;
data.write_all(&num_tags.to_le_bytes())
.expect("coudn't write to output file");
cb_tag_str = "b";
umi_tag_str = "u";
let bc_typeid = match bclen {
1..=4 => rad_types::encode_type_tag(rad_types::RadType::U8).unwrap(),
5..=8 => rad_types::encode_type_tag(rad_types::RadType::U16).unwrap(),
9..=16 => rad_types::encode_type_tag(rad_types::RadType::U32).unwrap(),
17..=32 => rad_types::encode_type_tag(rad_types::RadType::U64).unwrap(),
l => {
crit!(log, "cannot encode barcode of length {} > 32", l);
std::process::exit(1);
}
};
let umi_typeid = match umilen {
1..=4 => rad_types::encode_type_tag(rad_types::RadType::U8).unwrap(),
5..=8 => rad_types::encode_type_tag(rad_types::RadType::U16).unwrap(),
9..=16 => rad_types::encode_type_tag(rad_types::RadType::U32).unwrap(),
17..=32 => rad_types::encode_type_tag(rad_types::RadType::U64).unwrap(),
l => {
crit!(log, "cannot encode umi of length {} > 32", l);
std::process::exit(1);
}
};
rad_types::write_str_bin(cb_tag_str, &rad_types::RadIntId::U16, &mut data);
data.write_all(&bc_typeid.to_le_bytes())
.expect("coudn't write to output file");
rad_types::write_str_bin(umi_tag_str, &rad_types::RadIntId::U16, &mut data);
data.write_all(&umi_typeid.to_le_bytes())
.expect("coudn't write to output file");
num_tags = 1u16;
data.write_all(&num_tags.to_le_bytes())
.expect("couldn't write to output file");
let refid_str = "compressed_ori_refid";
typeid = 3u8;
rad_types::write_str_bin(refid_str, &rad_types::RadIntId::U16, &mut data);
data.write_all(&typeid.to_le_bytes())
.expect("coudn't write to output file");
data.write_all(&bclen.to_le_bytes())
.expect("coudn't write to output file");
data.write_all(&umilen.to_le_bytes())
.expect("coudn't write to output file");
}
owriter.write_all(data.get_ref()).unwrap();
let mut num_output_chunks = 0u64;
let mut local_nrec = 0u32;
let buf_limit = 10000u32;
data = Cursor::new(Vec::<u8>::with_capacity((buf_limit * 24) as usize));
data.write_all(&local_nrec.to_le_bytes()).unwrap();
data.write_all(&local_nrec.to_le_bytes()).unwrap();
let sty = ProgressStyle::default_bar()
.template(
"{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos:>7}/{len:7} {msg}",
)
.expect("ProgressStyle template was invalid.")
.progress_chars("╢▌▌░╟");
let expected_bar_length = bam_bytes / ((buf_limit as u64) * 24);
let pbar_inner = ProgressBar::new(expected_bar_length);
pbar_inner.set_style(sty);
pbar_inner.tick();
let mut old_qname = String::from("");
let mut bc = 0u64;
let mut umi = 0u64;
let mut tid_list = Vec::<u32>::new();
let mut first_pass = true;
loop {
if !first_pass {
let next_record_exists = bam.read(&mut rec).is_some();
if !next_record_exists {
break;
}
}
first_pass = false;
let is_reverse = rec.is_reverse();
let qname_str = str::from_utf8(rec.qname()).unwrap().to_owned();
let qname = qname_str;
let mut tid = rec.tid() as u32;
if qname == old_qname {
if !is_reverse {
tid |= MASK_LOWER_31_U32;
}
tid_list.push(tid);
continue;
}
if !tid_list.is_empty() {
assert!(!tid_list.is_empty(), "Trying to write empty tid_list");
let na = tid_list.len();
data.write_all(&(na as u32).to_le_bytes()).unwrap();
data.write_all(&(bc as u32).to_le_bytes()).unwrap();
data.write_all(&(umi as u32).to_le_bytes()).unwrap();
for t in tid_list.iter() {
data.write_all(&t.to_le_bytes()).unwrap();
}
}
if local_nrec > buf_limit {
data.set_position(0);
let nbytes = (data.get_ref().len()) as u32;
let nrec = local_nrec;
data.write_all(&nbytes.to_le_bytes()).unwrap();
data.write_all(&nrec.to_le_bytes()).unwrap();
owriter.write_all(data.get_ref()).unwrap();
pbar_inner.inc(1);
num_output_chunks += 1;
local_nrec = 0;
data = Cursor::new(Vec::<u8>::with_capacity((buf_limit * 24) as usize));
data.write_all(&local_nrec.to_le_bytes()).unwrap();
data.write_all(&local_nrec.to_le_bytes()).unwrap();
}
{
let bc_string_in: &str = if let Ok(Aux::String(bcs)) = rec.aux(b"CR") {
bcs
} else {
panic!("Input record missing CR tag!")
};
let umi_string_in: &str = if let Ok(Aux::String(umis)) = rec.aux(b"UR") {
umis
} else {
panic!("Input record missing UR tag!")
};
let bc_string = bc_string_in.replacen('N', "A", 1);
let umi_string = umi_string_in.replacen('N', "A", 1);
if let Some(_pos) = bc_string.find('N') {
continue;
}
if let Some(_pos) = umi_string.find('N') {
continue;
}
bc = cb_string_to_u64(bc_string.as_bytes()).unwrap();
umi = cb_string_to_u64(umi_string.as_bytes()).unwrap();
old_qname = qname.clone();
tid_list.clear();
if !is_reverse {
tid |= MASK_LOWER_31_U32;
}
tid_list.push(tid);
local_nrec += 1;
}
}
if local_nrec > 0 {
if !tid_list.is_empty() {
assert!(!tid_list.is_empty(), "Trying to write empty tid_list");
let na = tid_list.len();
data.write_all(&(na as u32).to_le_bytes()).unwrap();
data.write_all(&(bc as u32).to_le_bytes()).unwrap();
data.write_all(&(umi as u32).to_le_bytes()).unwrap();
for t in tid_list.iter() {
data.write_all(&t.to_le_bytes()).unwrap();
}
}
data.set_position(0);
let nbytes = (data.get_ref().len()) as u32;
let nrec = local_nrec;
data.write_all(&nbytes.to_le_bytes()).unwrap();
data.write_all(&nrec.to_le_bytes()).unwrap();
owriter.write_all(data.get_ref()).unwrap();
num_output_chunks += 1;
}
pbar_inner.finish_with_message("wrote all records.");
println!();
info!(log, "{:?} chunks written", num_output_chunks,);
owriter.flush().expect("File buffer could not be flushed");
owriter
.seek(SeekFrom::Start(end_header_pos))
.expect("couldn't seek in output file");
owriter
.write_all(&num_output_chunks.to_le_bytes())
.expect("couldn't write to output file.");
info!(log, "finished writing to {:?}.", rad_file.as_ref());
}
pub fn view<P>(rad_file: P, print_header: bool, log: &slog::Logger)
where
P: AsRef<Path>,
{
let _read_num = view2(rad_file, print_header, log).unwrap();
}
pub fn view2<P>(rad_file: P, print_header: bool, log: &slog::Logger) -> anyhow::Result<u64>
where
P: AsRef<Path>,
{
let i_file = File::open(rad_file).unwrap();
let mut br = BufReader::new(i_file);
let hdr = rad_types::RadHeader::from_bytes(&mut br);
let _fl_tags = rad_types::TagSection::from_bytes(&mut br);
let rl_tags = rad_types::TagSection::from_bytes(&mut br);
const BNAME: &str = "b";
const UNAME: &str = "u";
let mut bct: Option<u8> = None;
let mut umit: Option<u8> = None;
for rt in &rl_tags.tags {
if rt.name == BNAME || rt.name == UNAME {
if rad_types::decode_int_type_tag(rt.typeid).is_none() {
crit!(
log,
"currently only RAD types 1--4 are supported for 'b' and 'u' tags."
);
std::process::exit(libradicl::exit_codes::EXIT_UNSUPPORTED_TAG_TYPE);
}
if rt.name == BNAME {
bct = Some(rt.typeid);
}
if rt.name == UNAME {
umit = Some(rt.typeid);
}
}
}
let _al_tags = rad_types::TagSection::from_bytes(&mut br);
let ft_vals = rad_types::FileTags::from_bytes(&mut br);
let mut num_reads: u64 = 0;
let bc_type = rad_types::decode_int_type_tag(bct.expect("no barcode tag description present."))
.context("unknown barcode type id.")?;
let umi_type = rad_types::decode_int_type_tag(umit.expect("no umi tag description present"))
.context("unknown barcode type id.")?;
let stdout = stdout(); let stdout_l = stdout.lock();
let mut handle = BufWriter::new(stdout_l);
if print_header {
for i in 0usize..hdr.ref_names.len() {
match writeln!(handle, "{}:{}", i, hdr.ref_names[i]) {
Ok(_) => {}
Err(_) => {
return Ok(i as u64);
}
};
}
}
let mut id = 0usize;
for _ in 0..(hdr.num_chunks as usize) {
let c = rad_types::Chunk::from_bytes(&mut br, &bc_type, &umi_type);
for read in c.reads.iter() {
let bc_mer: BitKmer = (read.bc, ft_vals.bclen as u8);
let umi_mer: BitKmer = (read.umi, ft_vals.umilen as u8);
let num_entries = read.refs.len();
for i in 0usize..num_entries {
let tid = &hdr.ref_names[read.refs[i] as usize];
match writeln!(
handle,
"ID:{}\tHI:{}\tNH:{}\tCB:{}\tUMI:{}\tDIR:{:?}\t{}",
id,
i,
num_entries,
unsafe { std::str::from_utf8_unchecked(&bitmer_to_bytes(bc_mer)[..]) },
unsafe { std::str::from_utf8_unchecked(&bitmer_to_bytes(umi_mer)[..]) },
read.dirs[i],
tid,
) {
Ok(_) => {
num_reads += 1;
}
Err(_) => {
return Ok(num_reads);
}
};
}
id += 1;
}
}
Ok(num_reads)
}