use crate::cli::{PansnParameters, PgPansnOptions};
use crate::utils::bio_io;
use anyhow::Result;
use rust_htslib::bam::{self, Read};
use std::collections::HashMap;
use std::io::Write;
use std::sync::Once;
static ALIGNMENT_WARNING: Once = Once::new();
pub fn copy_header_from_bam(target_header: &mut bam::Header, source_bam_path: &str) -> Result<()> {
let source_reader = bio_io::bam_reader(source_bam_path);
let source_header = bam::Header::from_template(source_reader.header());
let source_hashmap = source_header.to_hashmap();
if source_hashmap.contains_key("RG") {
let mut target_hashmap = target_header.to_hashmap();
target_hashmap.remove("RG");
*target_header = crate::utils::bio_io::header_from_hashmap(target_hashmap);
}
for (record_type, records) in source_hashmap.iter() {
if record_type != "SQ" && record_type != "HD" {
for record in records.iter() {
let mut header_record = bam::header::HeaderRecord::new(record_type.as_bytes());
for (key, value) in record.iter() {
header_record.push_tag(key.as_bytes(), value);
}
target_header.push_record(&header_record);
}
}
}
for comment in source_header.comments() {
target_header.push_comment(comment.as_bytes());
}
log::info!(
"Copied header records from '{}' (excluding SQ and HD tags)",
source_bam_path
);
Ok(())
}
pub mod haplotype {
use super::*;
pub fn determine_haplotype(contig_name: &str, hap1_tag: &str, hap2_tag: &str) -> Option<u8> {
let has_hap1 = contig_name.contains(hap1_tag);
let has_hap2 = contig_name.contains(hap2_tag);
match (has_hap1, has_hap2) {
(true, false) => Some(1),
(false, true) => Some(2),
_ => None, }
}
pub fn build_haplotype_map(
header: &bam::Header,
hap1_tag: Option<&str>,
hap2_tag: Option<&str>,
) -> Result<Option<HashMap<i32, Option<u8>>>> {
if let (Some(hap1), Some(hap2)) = (hap1_tag, hap2_tag) {
log::info!("Building haplotype map with tags: '{hap1}' (hap1), '{hap2}' (hap2)");
let header_view = bam::HeaderView::from_header(header);
let mut map = HashMap::new();
for i in 0..header_view.target_count() {
let contig_name = std::str::from_utf8(header_view.target_names()[i as usize])
.map_err(|e| anyhow::anyhow!("Invalid contig name: {}", e))?;
let haplotype = determine_haplotype(contig_name, hap1, hap2);
map.insert(i as i32, haplotype);
if let Some(hp) = haplotype {
log::debug!("Contig '{contig_name}' -> haplotype {hp}");
}
}
Ok(Some(map))
} else {
Ok(None)
}
}
pub fn add_haplotype_tag(
record: &mut bam::Record,
haplotype_map: &HashMap<i32, Option<u8>>,
min_mapq: u8,
) -> Result<()> {
record.remove_aux(b"HP").unwrap_or(());
if record.is_secondary() || record.is_supplementary() {
super::ALIGNMENT_WARNING.call_once(|| {
log::warn!(
"Secondary alignments will not be haplotagged. Supplementary alignments, may be tagged with a different HP value than the primary alignment.",
);
});
}
if !record.is_secondary() && record.mapq() >= min_mapq {
let tid = record.tid();
if let Some(&Some(haplotype)) = haplotype_map.get(&tid) {
record.push_aux(b"HP", bam::record::Aux::U8(haplotype))?;
}
}
Ok(())
}
}
pub fn strip_pansn_from_contig_name(contig_name: &str, delimiter: char) -> String {
let mut del_count = 0;
let mut new_name = String::new();
for char in contig_name.chars() {
if del_count >= 2 {
new_name.push(char);
} else if char == delimiter {
del_count += 1;
}
}
new_name
}
pub fn apply_pansn_transformations(
header: &mut bam::Header,
params: &PansnParameters,
) -> Result<()> {
if let Some(ref prefix) = params.prefix {
log::info!("Adding panSN-spec prefix: {prefix}");
let original_comments: Vec<_> = header.comments().map(|c| c.to_string()).collect();
let mut header_hashmap = header.to_hashmap();
for (key, value) in header_hashmap.iter_mut() {
if key.eq("SQ") {
for sn_line in value.iter_mut() {
let name = sn_line
.get_mut("SN")
.expect("SN tag not found within an @SQ line");
let mut new_name = String::new();
new_name.push_str(prefix);
new_name.push_str(name);
name.clear();
name.push_str(&new_name);
}
}
}
*header = crate::utils::bio_io::header_from_hashmap(header_hashmap);
for comment in original_comments {
header.push_comment(comment.as_bytes());
}
} else if params.strip {
log::info!(
"Stripping panSN-spec with delimiter: {delimiter}",
delimiter = params.delimiter
);
let original_comments: Vec<_> = header.comments().map(|c| c.to_string()).collect();
let mut header_hashmap = header.to_hashmap();
for (key, value) in header_hashmap.iter_mut() {
if key.eq("SQ") {
for sn_line in value.iter_mut() {
let name = sn_line
.get_mut("SN")
.expect("SN tag not found within an @SQ line");
let mut del_count = 0;
let mut new_name = String::new();
for char in name.chars() {
if del_count >= 2 {
new_name.push(char);
} else if char == params.delimiter {
del_count += 1;
}
}
name.clear();
name.push_str(&new_name);
}
}
}
*header = crate::utils::bio_io::header_from_hashmap(header_hashmap);
for comment in original_comments {
header.push_comment(comment.as_bytes());
}
}
if let Some(ref copy_header_path) = params.copy_header {
copy_header_from_bam(header, copy_header_path)?;
}
Ok(())
}
pub fn run_pg_pansn(opts: &mut PgPansnOptions) -> Result<()> {
if !opts.pansn.has_operations() {
anyhow::bail!("Either --prefix/--strip, both --hap1-tag and --hap2-tag, or --copy-header must be provided");
}
let mut reader = opts.input.bam_reader();
let mut header = opts.input.header().clone();
apply_pansn_transformations(&mut header, &opts.pansn)?;
opts.input.header = Some(header);
let haplotype_map = haplotype::build_haplotype_map(
&opts.input.header(),
opts.pansn.hap1_tag.as_deref(),
opts.pansn.hap2_tag.as_deref(),
)?;
let mut writer = opts.input.bam_writer(&opts.out);
if let Some(header_out) = &opts.header_out {
let mut header_writer = bio_io::writer(header_out)?;
let header_string = bio_io::bam_header_to_string(writer.header());
header_writer.write_all(header_string.as_bytes())?;
log::info!("BAM header written to: {}", header_out);
}
for rec in reader.records() {
let mut record = rec?;
if let Some(ref hap_map) = haplotype_map {
haplotype::add_haplotype_tag(&mut record, hap_map, opts.pansn.min_mapq)?;
}
bio_io::write_record(&mut writer, &record)?;
}
Ok(())
}