use crate::bam_io::{BamReaderAuto, create_bam_reader, create_raw_bam_writer, is_stdin_path};
use crate::batched_sam_reader::BatchedSamReader;
use crate::commands::command::Command;
use crate::commands::common::{CompressionOptions, parse_bool};
use crate::logging::OperationTimer;
use crate::progress::ProgressTracker;
use crate::reference::find_dict_path;
use crate::sam::{
SamTag, buf_value_to_smallest_signed_int, check_sort, revcomp_buf_value, reverse_buf_value,
unclipped_five_prime_position,
};
use crate::sort::{PA_TAG, PrimaryAlignmentInfo, bam_fields};
use crate::template::{Template, TemplateIterator};
use crate::umi::TagInfo;
use crate::validation::validate_file_exists;
use crate::vendored::encode_record_buf;
use anyhow::{Context, Result};
use bstr::ByteSlice;
use clap::Parser;
use log::{debug, info, warn};
use noodles::sam::Header;
use noodles::sam::alignment::record::data::field::Tag;
use noodles::sam::alignment::record_buf::Data;
use noodles::sam::alignment::record_buf::RecordBuf;
use noodles::sam::alignment::record_buf::data::field::Value as BufValue;
use std::collections::HashSet;
use std::io::{BufRead, BufReader};
use std::path::{Path, PathBuf};
// CLI definition for the `zipper` subcommand (clap derive).
// NOTE: plain `//` comments are used on fields deliberately — clap turns
// `///` doc comments into per-flag help text, which would change the
// generated `--help` output.
#[derive(Parser, Debug)]
#[command(
    name = "zipper",
    author,
    version,
    about = "\x1b[38;5;72m[ALIGNMENT]\x1b[0m \x1b[36mZip unmapped BAM with aligned BAM\x1b[0m",
    long_about = r#"
Merges unmapped and mapped BAM files, transferring tags and metadata.
Takes an unmapped BAM (typically from FASTQ) and a mapped BAM (after alignment) and merges
them, copying tags from the unmapped to mapped reads. Both BAMs must be queryname sorted or
grouped, and have the same read name ordering.
The tool transfers tags from the unmapped reads to their corresponding mapped reads. For reads
mapped to the negative strand, tags can be optionally reversed or reverse-complemented. All
QC pass/fail flags are also transferred from the unmapped to mapped reads.
## Tag Manipulation
You can specify which tags to manipulate for reads mapped to the negative strand:
- --tags-to-reverse: Reverses array and string tags (e.g., [1,2,3] becomes [3,2,1])
- --tags-to-revcomp: Reverse complements sequence tags (e.g., AGAGG becomes CCTCT)
Named tag sets like "Consensus" are automatically expanded to their constituent tags:
- Consensus: aD bD cD aM bM cM aE bE cE ad bd cd ae be ce ac bc
## Default Behavior
By default, input is read from stdin and output is written to stdout, allowing for streaming
workflows like:
bwa mem -t 8 -p -K 150000000 -Y ref.fa reads.fq | fgumi zipper -u unmapped.bam -r ref.fa | fgumi sort -i /dev/stdin -o output.bam --order template-coordinate
"#
)]
#[command(verbatim_doc_comment)]
pub struct Zipper {
    // Mapped input (SAM or BAM); "-" streams SAM from stdin.
    #[arg(short = 'i', long, default_value = "-")]
    pub input: PathBuf,
    // Unmapped BAM whose tags and QC flags are transferred onto mapped reads.
    #[arg(short = 'u', long)]
    pub unmapped: PathBuf,
    // Reference FASTA; its sequence dictionary builds the output header.
    #[arg(short = 'r', long)]
    pub reference: PathBuf,
    // Output BAM path; "-" writes to stdout.
    #[arg(short = 'o', long, default_value = "-")]
    pub output: PathBuf,
    // Tags stripped from mapped reads (comma-separated two-char tags).
    #[arg(long, value_delimiter = ',')]
    pub tags_to_remove: Vec<String>,
    // Tags reversed on negative-strand reads (comma-separated).
    #[arg(long, value_delimiter = ',')]
    pub tags_to_reverse: Vec<String>,
    // Tags reverse-complemented on negative-strand reads (comma-separated).
    #[arg(long, value_delimiter = ',')]
    pub tags_to_revcomp: Vec<String>,
    // Bounded-channel capacity (templates) between reader threads and merge.
    #[arg(short = 'b', long, default_value = "50000")]
    pub buffer: usize,
    // Worker thread count passed to the BAM reader/writer.
    #[arg(long, short = 't', default_value = "1")]
    pub threads: usize,
    #[command(flatten)]
    pub compression: CompressionOptions,
    // Mirrors bwa's -K chunk size; sizes the adaptive stdin SAM buffer.
    #[arg(short = 'K', long = "bwa-chunk-size", default_value = "150000000")]
    pub bwa_chunk_size: u64,
    // When true, unmapped templates absent from the aligned BAM are dropped
    // instead of being passed through to the output.
    #[arg(long = "exclude-missing-reads", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
    pub exclude_missing_reads: bool,
    // When true, skip adding `pa` tags to secondary/supplementary records.
    #[arg(long = "skip-pa-tags", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
    pub skip_pa_tags: bool,
}
/// Builds the output SAM header for the zipped BAM.
///
/// Reference sequences come exclusively from the reference dictionary at
/// `dict_path`. Comments are taken from the unmapped header first, then the
/// mapped header. Read groups and programs from the mapped header take
/// precedence; the unmapped header contributes only entries whose IDs are
/// not already present. The `@HD` line, if any, comes from the unmapped
/// header.
pub fn build_output_header(unmapped: &Header, mapped: &Header, dict_path: &Path) -> Result<Header> {
    let dict_file =
        std::fs::File::open(dict_path).context("Failed to open reference dictionary")?;
    let mut dict_reader = noodles::sam::io::Reader::new(std::io::BufReader::new(dict_file));
    let dict_header = dict_reader.read_header()?;

    let mut builder = Header::builder();
    for (name, map) in dict_header.reference_sequences() {
        builder = builder.add_reference_sequence(name.clone(), map.clone());
    }

    // Comments: unmapped first, then mapped.
    for comment in unmapped.comments() {
        builder = builder.add_comment(comment.clone());
    }
    for comment in mapped.comments() {
        builder = builder.add_comment(comment.clone());
    }

    // Read groups: mapped wins on ID collisions.
    let mut seen_read_groups = HashSet::new();
    for (id, rg) in mapped.read_groups() {
        seen_read_groups.insert(id.clone());
        builder = builder.add_read_group(id.clone(), rg.clone());
    }
    for (id, rg) in unmapped.read_groups() {
        // `insert` returns false when the ID was already present.
        if seen_read_groups.insert(id.clone()) {
            builder = builder.add_read_group(id.clone(), rg.clone());
        }
    }

    // Programs: same precedence rule as read groups.
    let mut seen_programs = HashSet::new();
    for (id, pg) in mapped.programs().as_ref() {
        seen_programs.insert(id.clone());
        builder = builder.add_program(id.clone(), pg.clone());
    }
    for (id, pg) in unmapped.programs().as_ref() {
        if seen_programs.insert(id.clone()) {
            builder = builder.add_program(id.clone(), pg.clone());
        }
    }

    if let Some(hdr) = unmapped.header() {
        builder = builder.set_header(hdr.clone());
    }
    Ok(builder.build())
}
/// Copies auxiliary tags from `src` onto `dest`.
///
/// Skips any tag listed in `tag_info.remove`, and never overwrites an
/// existing `PG` tag on the destination (the aligner's program record on the
/// mapped read takes precedence). All other tags are inserted, replacing any
/// previous value for the same tag.
///
/// Perf: compares tag bytes directly instead of allocating a `String` per
/// tag via `format!` — this runs once per tag per read, and matches the
/// byte-level idiom already used by `merge_raw`.
pub fn copy_tags(src: &RecordBuf, dest: &mut Data, tag_info: &TagInfo) -> Result<()> {
    for (tag, value) in src.data().iter() {
        let tag_bytes: [u8; 2] = [tag.as_ref()[0], tag.as_ref()[1]];
        // Preserve the aligner's PG tag on the mapped read.
        if &tag_bytes == b"PG" && dest.get(&tag).is_some() {
            continue;
        }
        // SAM tags are ASCII by spec; a non-UTF-8 tag can never match a
        // removal entry, so the empty-string fallback is safe.
        let tag_str = std::str::from_utf8(&tag_bytes).unwrap_or("");
        if tag_info.remove.contains(tag_str) {
            continue;
        }
        dest.insert(tag, value.clone());
    }
    Ok(())
}
fn add_primary_alignment_tags(template: &mut Template) {
let has_sec_supp =
template.records.iter().any(|r| r.flags().is_secondary() || r.flags().is_supplementary());
if !has_sec_supp {
return; }
let r1_primary_info: Option<(i32, i32, bool)> = template.r1().and_then(|r| {
if r.flags().is_unmapped() {
return None;
}
let ref_id: i32 = r.reference_sequence_id()?.try_into().ok()?;
let pos: i32 = unclipped_five_prime_position(r)?.try_into().ok()?;
let is_reverse = r.flags().is_reverse_complemented();
Some((ref_id, pos, is_reverse))
});
let r2_primary_info: Option<(i32, i32, bool)> = template.r2().and_then(|r| {
if r.flags().is_unmapped() {
return None;
}
let ref_id: i32 = r.reference_sequence_id()?.try_into().ok()?;
let pos: i32 = unclipped_five_prime_position(r)?.try_into().ok()?;
let is_reverse = r.flags().is_reverse_complemented();
Some((ref_id, pos, is_reverse))
});
let pa_info: Option<PrimaryAlignmentInfo> = match (r1_primary_info, r2_primary_info) {
(Some((tid1, pos1, neg1)), Some((tid2, pos2, neg2))) => {
if (tid1, pos1) <= (tid2, pos2) {
Some(PrimaryAlignmentInfo::new(tid1, pos1, neg1, tid2, pos2, neg2))
} else {
Some(PrimaryAlignmentInfo::new(tid2, pos2, neg2, tid1, pos1, neg1))
}
}
(Some((tid, pos, neg)), None) | (None, Some((tid, pos, neg))) => {
Some(PrimaryAlignmentInfo::new(tid, pos, neg, tid, pos, neg))
}
(None, None) => None,
};
let Some(pa_info) = pa_info else {
return;
};
let pa_value = pa_info.to_tag_value();
for record in &mut template.records {
let flags = record.flags();
if !flags.is_secondary() && !flags.is_supplementary() {
continue;
}
record.data_mut().insert(PA_TAG, pa_value.clone());
}
}
/// Merges one unmapped template into its mapped counterpart (`RecordBuf` mode).
///
/// Steps, in order:
/// 1. fix mate info on the mapped template;
/// 2. strip tags listed in `tag_info.remove` from every mapped record;
/// 3. for each unmapped primary read, copy its tags onto the mapped records
///    of the same segment (reversing / reverse-complementing selected tags
///    for negative-strand alignments) and propagate its QC-fail flag;
/// 4. shrink AS/XS integer tags to their smallest signed representation;
/// 5. add `pa` tags unless `skip_pa_tags` is set.
pub fn merge(
    unmapped: &Template,
    mapped: &mut Template,
    tag_info: &TagInfo,
    skip_pa_tags: bool,
) -> Result<()> {
    mapped.fix_mate_info()?;
    // Strip unwanted tags from every mapped record up front.
    for i in 0..mapped.read_count() {
        for tag_str in &tag_info.remove {
            if tag_str.len() == 2 {
                let tag = Tag::new(tag_str.as_bytes()[0], tag_str.as_bytes()[1]);
                let _ = mapped.records[i].data_mut().remove(&tag);
            }
        }
    }
    for u in unmapped.primary_reads() {
        let is_unpaired = !u.flags().is_segmented();
        let is_first = u.flags().is_first_segment();
        // Select the mapped records (primary + secondary + supplementary)
        // that belong to the same segment as this unmapped read.
        let mapped_indices: Vec<usize> = if is_unpaired || is_first {
            mapped
                .records
                .iter()
                .enumerate()
                .filter(|(_, r)| !r.flags().is_segmented() || r.flags().is_first_segment())
                .map(|(i, _)| i)
                .collect()
        } else {
            mapped
                .records
                .iter()
                .enumerate()
                .filter(|(_, r)| r.flags().is_segmented() && !r.flags().is_first_segment())
                .map(|(i, _)| i)
                .collect()
        };
        // Determine the strand mix once so each branch can be skipped.
        let has_pos_reads =
            mapped_indices.iter().any(|&i| !mapped.records[i].flags().is_reverse_complemented());
        let has_neg_reads =
            mapped_indices.iter().any(|&i| mapped.records[i].flags().is_reverse_complemented());
        if has_pos_reads {
            // Forward-strand reads receive the unmapped tags verbatim.
            for &i in &mapped_indices {
                if !mapped.records[i].flags().is_reverse_complemented() {
                    copy_tags(u, mapped.records[i].data_mut(), tag_info)?;
                }
            }
        }
        if has_neg_reads {
            // Build a transformed copy of the unmapped read whose selected
            // tags are reversed / reverse-complemented, then copy from it.
            let mut u_for_neg = u.clone();
            if tag_info.has_revs_or_revcomps() {
                for (tag, value) in u.data().iter() {
                    let tag_str = format!("{}{}", tag.as_ref()[0] as char, tag.as_ref()[1] as char);
                    if tag_info.reverse.contains(&tag_str) {
                        let new_val = reverse_buf_value(value);
                        u_for_neg.data_mut().insert(tag, new_val);
                    } else if tag_info.revcomp.contains(&tag_str) {
                        let new_val = revcomp_buf_value(value);
                        u_for_neg.data_mut().insert(tag, new_val);
                    }
                }
            }
            for &i in &mapped_indices {
                if mapped.records[i].flags().is_reverse_complemented() {
                    copy_tags(&u_for_neg, mapped.records[i].data_mut(), tag_info)?;
                }
            }
        }
        // Mirror the unmapped read's QC-fail state onto all its mapped
        // records (set or cleared, not just set).
        let is_qc_fail = u.flags().is_qc_fail();
        for &i in &mapped_indices {
            let mut flags = mapped.records[i].flags();
            if is_qc_fail {
                flags.insert(noodles::sam::alignment::record::Flags::QC_FAIL);
            } else {
                flags.remove(noodles::sam::alignment::record::Flags::QC_FAIL);
            }
            *mapped.records[i].flags_mut() = flags;
        }
    }
    // Normalize AS/XS to the smallest signed int type that holds the value.
    let as_tag = Tag::from(SamTag::AS);
    let xs_tag = Tag::from(SamTag::XS);
    for i in 0..mapped.read_count() {
        if let Some(value) = mapped.records[i].data().get(&as_tag) {
            if let Some(int8_value) = buf_value_to_smallest_signed_int(value) {
                mapped.records[i].data_mut().insert(as_tag, int8_value);
            }
        }
        if let Some(value) = mapped.records[i].data().get(&xs_tag) {
            if let Some(int8_value) = buf_value_to_smallest_signed_int(value) {
                mapped.records[i].data_mut().insert(xs_tag, int8_value);
            }
        }
    }
    if !skip_pa_tags {
        add_primary_alignment_tags(mapped);
    }
    Ok(())
}
/// Appends one aux field (tag + type code + payload) to `dest` using the
/// little-endian BAM binary encoding of optional fields.
fn append_buf_value_raw(dest: &mut Vec<u8>, tag: [u8; 2], value: &BufValue) {
    use noodles::sam::alignment::record_buf::data::field::value::Array;

    /// Writes the 4-byte little-endian element count that precedes every
    /// B-array payload.
    fn push_array_len(dest: &mut Vec<u8>, len: usize) {
        let count = u32::try_from(len).expect("BAM B-array length exceeds u32");
        dest.extend_from_slice(&count.to_le_bytes());
    }

    // Every encoded field begins with its two-byte tag.
    dest.extend_from_slice(&tag);
    match value {
        BufValue::Character(c) => {
            dest.push(b'A');
            dest.push(*c);
        }
        BufValue::Int8(v) => {
            dest.push(b'c');
            dest.push(v.cast_unsigned());
        }
        BufValue::UInt8(v) => {
            dest.push(b'C');
            dest.push(*v);
        }
        BufValue::Int16(v) => {
            dest.push(b's');
            dest.extend_from_slice(&v.to_le_bytes());
        }
        BufValue::UInt16(v) => {
            dest.push(b'S');
            dest.extend_from_slice(&v.to_le_bytes());
        }
        BufValue::Int32(v) => {
            dest.push(b'i');
            dest.extend_from_slice(&v.to_le_bytes());
        }
        BufValue::UInt32(v) => {
            dest.push(b'I');
            dest.extend_from_slice(&v.to_le_bytes());
        }
        BufValue::Float(v) => {
            dest.push(b'f');
            dest.extend_from_slice(&v.to_le_bytes());
        }
        BufValue::String(s) => {
            // NUL-terminated printable string.
            dest.push(b'Z');
            dest.extend_from_slice(s.as_bytes());
            dest.push(0);
        }
        BufValue::Hex(s) => {
            // NUL-terminated hex string.
            dest.push(b'H');
            dest.extend_from_slice(s.as_bytes());
            dest.push(0);
        }
        BufValue::Array(arr) => {
            dest.push(b'B');
            match arr {
                Array::Int8(vals) => {
                    dest.push(b'c');
                    push_array_len(dest, vals.len());
                    for v in vals {
                        dest.push(v.cast_unsigned());
                    }
                }
                Array::UInt8(vals) => {
                    dest.push(b'C');
                    push_array_len(dest, vals.len());
                    // u8 payload is already in wire format.
                    dest.extend_from_slice(vals.as_slice());
                }
                Array::Int16(vals) => {
                    dest.push(b's');
                    push_array_len(dest, vals.len());
                    for v in vals {
                        dest.extend_from_slice(&v.to_le_bytes());
                    }
                }
                Array::UInt16(vals) => {
                    dest.push(b'S');
                    push_array_len(dest, vals.len());
                    for v in vals {
                        dest.extend_from_slice(&v.to_le_bytes());
                    }
                }
                Array::Int32(vals) => {
                    dest.push(b'i');
                    push_array_len(dest, vals.len());
                    for v in vals {
                        dest.extend_from_slice(&v.to_le_bytes());
                    }
                }
                Array::UInt32(vals) => {
                    dest.push(b'I');
                    push_array_len(dest, vals.len());
                    for v in vals {
                        dest.extend_from_slice(&v.to_le_bytes());
                    }
                }
                Array::Float(vals) => {
                    dest.push(b'f');
                    push_array_len(dest, vals.len());
                    for v in vals {
                        dest.extend_from_slice(&v.to_le_bytes());
                    }
                }
            }
        }
    }
}
/// Raw-byte-mode counterpart of `add_primary_alignment_tags`: annotates
/// secondary/supplementary raw records with a `pa` i32-array tag describing
/// both primary alignments. No-op when the template is not in raw mode, has
/// no secondary/supplementary records, or neither primary read is mapped.
fn add_primary_alignment_tags_raw(mapped: &mut Template) {
    let rr = match mapped.raw_records.as_ref() {
        Some(rr) => rr,
        None => return,
    };
    // Only worth computing when there is something to tag.
    let has_sec_supp = rr.iter().any(|r| {
        let f = bam_fields::flags(r);
        (f & bam_fields::flags::SECONDARY) != 0 || (f & bam_fields::flags::SUPPLEMENTARY) != 0
    });
    if !has_sec_supp {
        return;
    }
    // (reference id, unclipped 5' position, is-reverse) for each mapped
    // primary read; `None` when the primary is absent or unmapped.
    let r1_info: Option<(i32, i32, bool)> = mapped.r1.and_then(|(i, _)| {
        let r = &rr[i];
        let f = bam_fields::flags(r);
        if (f & bam_fields::flags::UNMAPPED) != 0 {
            return None;
        }
        let ref_id = bam_fields::ref_id(r);
        let pos = bam_fields::unclipped_5prime_from_raw_bam(r);
        let is_reverse = (f & bam_fields::flags::REVERSE) != 0;
        Some((ref_id, pos, is_reverse))
    });
    let r2_info: Option<(i32, i32, bool)> = mapped.r2.and_then(|(i, _)| {
        let r = &rr[i];
        let f = bam_fields::flags(r);
        if (f & bam_fields::flags::UNMAPPED) != 0 {
            return None;
        }
        let ref_id = bam_fields::ref_id(r);
        let pos = bam_fields::unclipped_5prime_from_raw_bam(r);
        let is_reverse = (f & bam_fields::flags::REVERSE) != 0;
        Some((ref_id, pos, is_reverse))
    });
    // Order the pair so the smaller (tid, pos) comes first; a lone mapped
    // read fills both slots.
    let pa_info: Option<PrimaryAlignmentInfo> = match (r1_info, r2_info) {
        (Some((t1, p1, n1)), Some((t2, p2, n2))) => {
            if (t1, p1) <= (t2, p2) {
                Some(PrimaryAlignmentInfo::new(t1, p1, n1, t2, p2, n2))
            } else {
                Some(PrimaryAlignmentInfo::new(t2, p2, n2, t1, p1, n1))
            }
        }
        (Some((t, p, n)), None) | (None, Some((t, p, n))) => {
            Some(PrimaryAlignmentInfo::new(t, p, n, t, p, n))
        }
        (None, None) => None,
    };
    let Some(pa_info) = pa_info else {
        return;
    };
    let pa_tag = b"pa";
    // Layout matches PrimaryAlignmentInfo: tid1, pos1, neg1, tid2, pos2, neg2.
    let pa_values = [
        pa_info.tid1,
        pa_info.pos1,
        i32::from(pa_info.neg1),
        pa_info.tid2,
        pa_info.pos2,
        i32::from(pa_info.neg2),
    ];
    // Re-borrow mutably to rewrite the tag on each sec/supp record
    // (remove-then-append so a stale `pa` is never left behind).
    let rr = mapped.raw_records.as_mut().unwrap();
    for record in rr.iter_mut() {
        let f = bam_fields::flags(record);
        if (f & bam_fields::flags::SECONDARY) != 0 || (f & bam_fields::flags::SUPPLEMENTARY) != 0 {
            bam_fields::remove_tag(record, pa_tag);
            bam_fields::append_i32_array_tag(record, pa_tag, &pa_values);
        }
    }
}
/// Ensures `template` carries raw BAM byte records, encoding each
/// `RecordBuf` against `header` when it does not already.
fn convert_template_to_raw(template: &mut Template, header: &Header) -> Result<()> {
    // Already raw: nothing to encode.
    if template.is_raw_byte_mode() {
        return Ok(());
    }
    let encoded = template
        .records
        .iter()
        .map(|record| {
            let mut buf = Vec::with_capacity(256);
            encode_record_buf(&mut buf, header, record)
                .map_err(|e| anyhow::anyhow!("Failed to encode record: {e}"))?;
            Ok(buf)
        })
        .collect::<Result<Vec<_>>>()?;
    template.raw_records = Some(encoded);
    Ok(())
}
/// Collects the indices of all mapped records belonging to one segment:
/// the primary read followed by its supplementary and secondary ranges.
fn collect_mapped_indices(mapped: &Template, is_first_segment: bool) -> Vec<usize> {
    // Pick the R1 or R2 field triple once, then share one building path.
    let (primary, supplementals, secondaries) = if is_first_segment {
        (&mapped.r1, &mapped.r1_supplementals, &mapped.r1_secondaries)
    } else {
        (&mapped.r2, &mapped.r2_supplementals, &mapped.r2_secondaries)
    };
    let mut indices = Vec::with_capacity(4);
    if let Some((i, _)) = primary {
        indices.push(*i);
    }
    for range in [supplementals, secondaries] {
        if let Some((start, end)) = range {
            indices.extend(*start..*end);
        }
    }
    indices
}
/// Raw-byte-mode counterpart of `merge`: merges one unmapped template into
/// its mapped counterpart by editing the encoded BAM records in place.
/// Errors if `mapped` has no raw records. The phases mirror `merge`:
/// mate-info fix, tag removal, per-segment tag copy (with strand-aware
/// reverse/revcomp), QC-fail propagation, AS/XS normalization, `pa` tags.
pub fn merge_raw(
    unmapped: &Template,
    mapped: &mut Template,
    tag_info: &TagInfo,
    skip_pa_tags: bool,
) -> Result<()> {
    mapped.fix_mate_info_raw()?;
    let rr = mapped
        .raw_records
        .as_mut()
        .ok_or_else(|| anyhow::anyhow!("merge_raw: mapped not in raw mode"))?;
    // Strip unwanted tags from every mapped record up front.
    for record in rr.iter_mut() {
        for tag_str in &tag_info.remove {
            if tag_str.len() == 2 {
                let tag_bytes: [u8; 2] = [tag_str.as_bytes()[0], tag_str.as_bytes()[1]];
                bam_fields::remove_tag(record, &tag_bytes);
            }
        }
    }
    let has_transforms = tag_info.has_revs_or_revcomps();
    let pg_tag: [u8; 2] = [b'P', b'G'];
    for u in unmapped.primary_reads() {
        let is_unpaired = !u.flags().is_segmented();
        let is_first = u.flags().is_first_segment();
        let mapped_indices = collect_mapped_indices(mapped, is_unpaired || is_first);
        // Determine the strand mix once so each branch below can skip.
        let (has_pos, has_neg) = {
            let rr = mapped.raw_records.as_ref().unwrap();
            let mut pos = false;
            let mut neg = false;
            for &i in &mapped_indices {
                if (bam_fields::flags(&rr[i]) & bam_fields::flags::REVERSE) == 0 {
                    pos = true;
                } else {
                    neg = true;
                }
                if pos && neg {
                    break;
                }
            }
            (pos, neg)
        };
        if has_pos {
            // Forward-strand records get the unmapped tags verbatim
            // (remove-then-append replaces any existing value).
            let rr = mapped.raw_records.as_mut().unwrap();
            for &i in &mapped_indices {
                if (bam_fields::flags(&rr[i]) & bam_fields::flags::REVERSE) != 0 {
                    continue;
                }
                let aux = bam_fields::aux_data_slice(&rr[i]);
                let has_pg = bam_fields::find_tag_type(aux, &pg_tag).is_some();
                for (tag, value) in u.data().iter() {
                    let tag_bytes: [u8; 2] = [tag.as_ref()[0], tag.as_ref()[1]];
                    // Never clobber the aligner's PG tag.
                    if tag_bytes == pg_tag && has_pg {
                        continue;
                    }
                    let tag_str = std::str::from_utf8(&tag_bytes).unwrap_or("");
                    if tag_info.remove.contains(tag_str) {
                        continue;
                    }
                    bam_fields::remove_tag(&mut rr[i], &tag_bytes);
                    append_buf_value_raw(&mut rr[i], tag_bytes, value);
                }
            }
        }
        if has_neg {
            // Reverse-strand records additionally get selected tags
            // reversed or reverse-complemented in place after appending.
            let rr = mapped.raw_records.as_mut().unwrap();
            for &i in &mapped_indices {
                if (bam_fields::flags(&rr[i]) & bam_fields::flags::REVERSE) == 0 {
                    continue;
                }
                let aux = bam_fields::aux_data_slice(&rr[i]);
                let has_pg = bam_fields::find_tag_type(aux, &pg_tag).is_some();
                // NOTE(review): aux_offset is captured before the tag edits
                // below; presumably it depends only on the fixed-length
                // prefix (name/cigar/seq/qual), which tag edits do not
                // touch — confirm in bam_fields.
                let aux_offset =
                    bam_fields::aux_data_offset_from_record(&rr[i]).unwrap_or(rr[i].len());
                for (tag, value) in u.data().iter() {
                    let tag_bytes: [u8; 2] = [tag.as_ref()[0], tag.as_ref()[1]];
                    if tag_bytes == pg_tag && has_pg {
                        continue;
                    }
                    let tag_str = std::str::from_utf8(&tag_bytes).unwrap_or("");
                    if tag_info.remove.contains(tag_str) {
                        continue;
                    }
                    bam_fields::remove_tag(&mut rr[i], &tag_bytes);
                    append_buf_value_raw(&mut rr[i], tag_bytes, value);
                    if has_transforms && tag_info.reverse.contains(tag_str) {
                        reverse_tag_in_place_raw(&mut rr[i], aux_offset, tag_bytes, value);
                    } else if has_transforms && tag_info.revcomp.contains(tag_str) {
                        revcomp_tag_in_place_raw(&mut rr[i], aux_offset, tag_bytes, value);
                    }
                }
            }
        }
        // Mirror the unmapped read's QC-fail state (set or cleared) onto
        // all of its mapped records.
        let is_qc_fail = u.flags().is_qc_fail();
        let rr = mapped.raw_records.as_mut().unwrap();
        for &i in &mapped_indices {
            let mut f = bam_fields::flags(&rr[i]);
            if is_qc_fail {
                f |= bam_fields::flags::QC_FAIL;
            } else {
                f &= !bam_fields::flags::QC_FAIL;
            }
            bam_fields::set_flags(&mut rr[i], f);
        }
    }
    // Normalize AS/XS to the smallest signed int type that holds the value.
    let rr = mapped.raw_records.as_mut().unwrap();
    for record in rr.iter_mut() {
        bam_fields::normalize_int_tag_to_smallest_signed(record, b"AS");
        bam_fields::normalize_int_tag_to_smallest_signed(record, b"XS");
    }
    if !skip_pa_tags {
        add_primary_alignment_tags_raw(mapped);
    }
    Ok(())
}
/// Reverses a string or array tag directly in the raw record bytes (aux
/// section starting at `aux_offset`). Scalar values have no orientation and
/// are left untouched.
fn reverse_tag_in_place_raw(
    record: &mut [u8],
    aux_offset: usize,
    tag_bytes: [u8; 2],
    value: &BufValue,
) {
    if matches!(value, BufValue::String(_)) {
        bam_fields::reverse_string_tag_in_place(record, aux_offset, &tag_bytes);
    } else if matches!(value, BufValue::Array(_)) {
        bam_fields::reverse_array_tag_in_place(record, aux_offset, &tag_bytes);
    }
}
/// Reverse-complements a string tag directly in the raw record bytes.
/// Arrays are only reversed (numeric arrays have no complement); scalars
/// are left untouched.
/// NOTE(review): the array fallback to a plain reverse is presumed to match
/// `revcomp_buf_value`'s handling of arrays in the RecordBuf path — confirm.
fn revcomp_tag_in_place_raw(
    record: &mut [u8],
    aux_offset: usize,
    tag_bytes: [u8; 2],
    value: &BufValue,
) {
    if matches!(value, BufValue::String(_)) {
        bam_fields::reverse_complement_string_tag_in_place(record, aux_offset, &tag_bytes);
    } else if matches!(value, BufValue::Array(_)) {
        bam_fields::reverse_array_tag_in_place(record, aux_offset, &tag_bytes);
    }
}
/// Encodes every read of an unmapped template into raw BAM bytes against
/// `header`, for pass-through writing when no mapped counterpart exists.
fn encode_unmapped_template_records(template: &Template, header: &Header) -> Result<Vec<Vec<u8>>> {
    template
        .all_reads()
        .into_iter()
        .map(|record| {
            let mut buf = Vec::with_capacity(256);
            encode_record_buf(&mut buf, header, record)
                .map_err(|e| anyhow::anyhow!("encode unmapped: {e}"))?;
            Ok(buf)
        })
        .collect()
}
impl Zipper {
    /// Streams templates from both inputs in lockstep, merging tags from
    /// each unmapped template into the matching mapped template and writing
    /// the result as raw BAM records.
    ///
    /// Returns the number of records written. Errors if mapped templates
    /// remain after the unmapped input is exhausted (name-order mismatch).
    ///
    /// Cleanup: the "unmapped template with no mapped counterpart" branch
    /// and the trailing "mapped reads remaining" bail were each duplicated
    /// verbatim; both are now handled once.
    fn process_raw<U, M>(
        &self,
        unmapped_iter: U,
        mut mapped_iter: M,
        output_header: &Header,
        tag_info: &TagInfo,
    ) -> Result<u64>
    where
        U: Iterator<Item = Result<Template>>,
        M: Iterator<Item = Result<Template>>,
    {
        let mut writer = create_raw_bam_writer(
            &self.output,
            output_header,
            self.threads,
            self.compression.compression_level,
        )?;
        let progress = ProgressTracker::new("Processed records").with_interval(1_000_000);
        // One-template lookahead into the mapped stream.
        let mut mapped_peek: Option<Template> = None;
        let mut templates_not_in_mapped_bam: u64 = 0;
        for unmapped_result in unmapped_iter {
            let unmapped_template = unmapped_result?;
            if mapped_peek.is_none() {
                mapped_peek = mapped_iter.next().transpose()?;
            }
            let names_match =
                mapped_peek.as_ref().is_some_and(|mt| mt.name == unmapped_template.name);
            if names_match {
                // Consume the peeked mapped template and merge into it.
                let mut mapped_template = mapped_peek.take().expect("peek checked above");
                convert_template_to_raw(&mut mapped_template, output_header)?;
                merge_raw(&unmapped_template, &mut mapped_template, tag_info, self.skip_pa_tags)?;
                let rr = mapped_template.raw_records.as_ref().unwrap();
                for rec in rr {
                    writer.write_raw_record(rec)?;
                    progress.log_if_needed(1);
                }
            } else {
                // No counterpart in the mapped stream: drop or pass the
                // unmapped template through, per configuration.
                debug!(
                    "Found unmapped read with no corresponding mapped \
                     read: {}",
                    String::from_utf8_lossy(&unmapped_template.name)
                );
                if self.exclude_missing_reads {
                    templates_not_in_mapped_bam += 1;
                } else {
                    let raw = encode_unmapped_template_records(&unmapped_template, output_header)?;
                    for rec in &raw {
                        writer.write_raw_record(rec)?;
                        progress.log_if_needed(1);
                    }
                }
            }
        }
        progress.log_final();
        // Any leftover mapped template — whether already peeked or still in
        // the iterator — means the inputs disagreed on names or ordering.
        let leftover = match mapped_peek {
            Some(t) => Some(t),
            None => mapped_iter.next().transpose()?,
        };
        if let Some(remaining) = leftover {
            anyhow::bail!(
                "Error: processed all unmapped reads but there are mapped \
                 reads remaining. Found template '{}'. Please ensure the \
                 unmapped and mapped reads have the same set of read names \
                 in the same order, and reads with the same name are \
                 consecutive (grouped) in each input.",
                String::from_utf8_lossy(&remaining.name)
            );
        }
        if self.exclude_missing_reads && templates_not_in_mapped_bam > 0 {
            info!(
                "Excluded {templates_not_in_mapped_bam} templates that \
                 were not present in the aligned BAM."
            );
        }
        writer.finish()?;
        Ok(progress.count())
    }
}
/// Source of the mapped alignments: streamed SAM text or a BAM file.
enum MappedReader {
    // SAM text from stdin or a plain file.
    Sam(noodles::sam::io::Reader<Box<dyn BufRead + Send>>),
    // BAM file input.
    Bam(BamReaderAuto),
}
impl Command for Zipper {
    /// Runs the zipper: validates inputs, builds the output header from the
    /// reference dictionary, then streams unmapped and mapped templates on
    /// two reader threads through bounded channels into `process_raw`.
    fn execute(&self, command_line: &str) -> Result<()> {
        info!("Starting zipper");
        let timer = OperationTimer::new("Zipping BAMs");
        validate_file_exists(&self.unmapped, "unmapped BAM file")?;
        validate_file_exists(&self.reference, "reference FASTA file")?;
        let dict_path = find_dict_path(&self.reference).ok_or_else(|| {
            anyhow::anyhow!(
                "Reference dictionary file not found. Tried:\n \
                 - {}\n \
                 - {}.dict\n\
                 Please run: samtools dict {} -o {}",
                self.reference.with_extension("dict").display(),
                self.reference.display(),
                self.reference.display(),
                self.reference.with_extension("dict").display()
            )
        })?;
        let (mut unmapped_reader, unmapped_header) = create_bam_reader(&self.unmapped, 1)?;
        // Choose the mapped-input reader: stdin SAM, BAM file, or SAM file.
        let (mapped_reader, mapped_header) = if is_stdin_path(&self.input) {
            info!("Reading SAM from stdin with adaptive buffer (bwa -K {})", self.bwa_chunk_size);
            let reader: Box<dyn BufRead + Send> =
                Box::new(BatchedSamReader::new(std::io::stdin(), self.bwa_chunk_size));
            let mut sam_reader = noodles::sam::io::Reader::new(reader);
            let header = sam_reader.read_header()?;
            (MappedReader::Sam(sam_reader), header)
        } else if self.input.extension().is_some_and(|ext| ext.eq_ignore_ascii_case("bam")) {
            warn!(
                "BAM input detected for --input. For best performance, pipe SAM directly \
                 from the aligner (e.g. bwa mem ... | fgumi zipper ...)."
            );
            let (bam_reader, header) = create_bam_reader(&self.input, self.threads)?;
            (MappedReader::Bam(bam_reader), header)
        } else {
            let reader: Box<dyn BufRead + Send> = Box::new(BufReader::with_capacity(
                256 * 1024,
                std::fs::File::open(&self.input).context("Failed to open mapped SAM")?,
            ));
            let mut sam_reader = noodles::sam::io::Reader::new(reader);
            let header = sam_reader.read_header()?;
            (MappedReader::Sam(sam_reader), header)
        };
        // Warn (not fail) when inputs do not look queryname-grouped.
        check_sort(&unmapped_header, &self.unmapped, "unmapped");
        check_sort(&mapped_header, &self.input, "mapped");
        let output_header = build_output_header(&unmapped_header, &mapped_header, &dict_path)?;
        let output_header = crate::commands::common::add_pg_record(output_header, command_line)?;
        let tag_info = TagInfo::new(
            self.tags_to_remove.clone(),
            self.tags_to_reverse.clone(),
            self.tags_to_revcomp.clone(),
        );
        if !tag_info.remove.is_empty() {
            info!("Tags for removal: {:?}", tag_info.remove);
        }
        if !tag_info.reverse.is_empty() {
            info!("Tags being reversed: {:?}", tag_info.reverse);
        }
        if !tag_info.revcomp.is_empty() {
            info!("Tags being reverse complemented: {:?}", tag_info.revcomp);
        }
        // Reader thread for the unmapped input; a bounded channel lets the
        // reader stay ahead of the merge loop without unbounded memory.
        let (unmapped_tx, unmapped_rx) =
            std::sync::mpsc::sync_channel::<Result<Template>>(self.buffer);
        let unmapped_header_for_reader = unmapped_header.clone();
        std::thread::spawn(move || {
            let unmapped_iter = TemplateIterator::new(
                unmapped_reader
                    .record_bufs(&unmapped_header_for_reader)
                    .map(|r| r.map_err(anyhow::Error::from)),
            );
            for template in unmapped_iter {
                // Receiver dropped (e.g. on error): stop reading.
                if unmapped_tx.send(template).is_err() {
                    break;
                }
            }
        });
        let unmapped_iter = std::iter::from_fn(move || unmapped_rx.recv().ok());
        // Reader thread for the mapped input (SAM or BAM variant).
        let (mapped_tx, mapped_rx) = std::sync::mpsc::sync_channel::<Result<Template>>(self.buffer);
        let mapped_header_for_reader = mapped_header.clone();
        std::thread::spawn(move || {
            match mapped_reader {
                MappedReader::Sam(mut r) => {
                    let mapped_iter = TemplateIterator::new(
                        r.record_bufs(&mapped_header_for_reader)
                            .map(|rec| rec.map_err(anyhow::Error::from)),
                    );
                    for template in mapped_iter {
                        if mapped_tx.send(template).is_err() {
                            break;
                        }
                    }
                }
                MappedReader::Bam(mut r) => {
                    let mapped_iter = TemplateIterator::new(
                        r.record_bufs(&mapped_header_for_reader)
                            .map(|rec| rec.map_err(anyhow::Error::from)),
                    );
                    for template in mapped_iter {
                        if mapped_tx.send(template).is_err() {
                            break;
                        }
                    }
                }
            }
        });
        let mapped_iter = std::iter::from_fn(move || mapped_rx.recv().ok());
        let total_records =
            self.process_raw(unmapped_iter, mapped_iter, &output_header, &tag_info)?;
        info!("zipper completed successfully");
        timer.log_completion(total_records);
        Ok(())
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::sam::builder::{
MAPPED_PG_ID, REFERENCE_LENGTH, RecordBuilder, SamBuilder, create_ref_dict,
};
use anyhow::Result;
use bstr::ByteSlice;
use noodles::sam::alignment::record::Flags;
use noodles::sam::alignment::record::data::field::Tag;
use noodles::sam::alignment::record_buf::RecordBuf;
use noodles::sam::alignment::record_buf::data::field::Value as BufValue;
use rstest::rstest;
use std::collections::HashMap;
use tempfile::TempDir;
    /// End-to-end test harness: writes both builders to temp files, runs
    /// the zipper with a fixed configuration, and returns the decoded
    /// output records.
    fn run_zipper(
        unmapped: &SamBuilder,
        mapped: &SamBuilder,
        tags_to_remove: Vec<String>,
        tags_to_reverse: Vec<String>,
        tags_to_revcomp: Vec<String>,
    ) -> Result<Vec<RecordBuf>> {
        let dir = TempDir::new()?;
        let unmapped_path = dir.path().join("unmapped.bam");
        let mapped_path = dir.path().join("mapped.sam");
        let output_path = dir.path().join("output.bam");
        let dict_path = create_ref_dict(&dir, "chr1", REFERENCE_LENGTH)?;
        unmapped.write(&unmapped_path)?;
        mapped.write_sam(&mapped_path)?;
        let zipper = Zipper {
            input: mapped_path.clone(),
            unmapped: unmapped_path.clone(),
            reference: dict_path,
            output: output_path.clone(),
            tags_to_remove,
            tags_to_reverse,
            tags_to_revcomp,
            buffer: 5000,
            threads: 1,
            compression: CompressionOptions { compression_level: 1 },
            bwa_chunk_size: 150_000_000,
            exclude_missing_reads: false,
            skip_pa_tags: false,
        };
        zipper.execute("test")?;
        read_bam_records(&output_path)
    }
    /// Reads every record of a BAM file into `RecordBuf`s.
    fn read_bam_records(path: &std::path::Path) -> Result<Vec<RecordBuf>> {
        let mut reader = noodles::bam::io::reader::Builder.build_from_path(path)?;
        let header = reader.read_header()?;
        Ok(reader.record_bufs(&header).collect::<std::io::Result<Vec<_>>>()?)
    }
#[test]
fn test_basic_merge() -> Result<()> {
let mut unmapped = SamBuilder::new_unmapped();
let mut mapped = SamBuilder::new_mapped();
let mut attrs1 = HashMap::new();
attrs1.insert("RX", BufValue::from("ACGT".to_string()));
attrs1.insert("xy", BufValue::from(1234i32));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs1);
let mut attrs2 = HashMap::new();
attrs2.insert("RX", BufValue::from("GGTA".to_string()));
attrs2.insert("xy", BufValue::from(4567i32));
unmapped.add_pair_with_attrs("q2", None, None, true, true, &attrs2);
let mut mapped_attrs1 = HashMap::new();
mapped_attrs1.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs1.insert("AS", BufValue::from(77i32));
mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs1);
let mut mapped_attrs2 = HashMap::new();
mapped_attrs2.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs2.insert("AS", BufValue::from(16i32));
mapped.add_pair_with_attrs("q2", Some(500), Some(700), true, true, &mapped_attrs2);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 4);
for rec in &records {
assert!(rec.data().get(&Tag::from(SamTag::RX)).is_some());
assert!(rec.data().get(&Tag::new(b'x', b'y')).is_some());
assert!(rec.data().get(&Tag::from(SamTag::AS)).is_some());
assert!(rec.data().get(&Tag::from(SamTag::PG)).is_some());
assert!(rec.data().get(&Tag::from(SamTag::MC)).is_some(), "MC tag should be present");
assert!(rec.data().get(&Tag::from(SamTag::MQ)).is_some(), "MQ tag should be present");
}
Ok(())
}
#[test]
fn test_unpaired_reads() -> Result<()> {
let mut unmapped = SamBuilder::new_unmapped();
let mut mapped = SamBuilder::new_mapped();
let mut attrs1 = HashMap::new();
attrs1.insert("RX", BufValue::from("ACGT".to_string()));
attrs1.insert("xy", BufValue::from(1234i32));
unmapped.add_frag_with_attrs("q1", None, true, &attrs1);
let mut attrs2 = HashMap::new();
attrs2.insert("RX", BufValue::from("GGTA".to_string()));
attrs2.insert("xy", BufValue::from(4567i32));
unmapped.add_frag_with_attrs("q2", None, true, &attrs2);
let mut mapped_attrs1 = HashMap::new();
mapped_attrs1.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs1.insert("AS", BufValue::from(77i32));
mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs1);
let mut mapped_attrs2 = HashMap::new();
mapped_attrs2.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs2.insert("AS", BufValue::from(16i32));
mapped.add_frag_with_attrs("q2", Some(500), false, &mapped_attrs2);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 2);
for rec in &records {
assert!(rec.data().get(&Tag::from(SamTag::RX)).is_some());
assert!(rec.data().get(&Tag::new(b'x', b'y')).is_some());
assert!(rec.data().get(&Tag::from(SamTag::AS)).is_some());
}
Ok(())
}
#[test]
fn test_tag_removal_reverse_revcomp() -> Result<()> {
    // Fetch a tag as an Int16 array, failing the test (instead of silently
    // skipping the check, as the previous `if let` pattern did) when the tag
    // is absent or has another type.
    fn int16_tag(rec: &RecordBuf, tag: Tag) -> Vec<i16> {
        match rec.data().get(&tag) {
            Some(BufValue::Array(
                noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
            )) => vals.clone(),
            other => panic!("expected Int16 array for {tag:?}, got {other:?}"),
        }
    }
    // Fetch a tag as a String, failing the test when absent or mistyped.
    fn string_tag(rec: &RecordBuf, tag: Tag) -> String {
        match rec.data().get(&tag) {
            Some(BufValue::String(s)) => s.to_string(),
            other => panic!("expected String value for {tag:?}, got {other:?}"),
        }
    }

    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("n1", BufValue::from(vec![1i16, 2, 3, 4, 5]));
    attrs.insert("n2", BufValue::from(vec![2i16, 3, 4, 5, 6]));
    attrs.insert("s1", BufValue::from("abcde".to_string()));
    attrs.insert("s2", BufValue::from("vwxyz".to_string()));
    attrs.insert("s3", BufValue::from("AGAGG".to_string()));
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    let mut mapped_attrs = HashMap::new();
    mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
    mapped_attrs.insert("AS", BufValue::from(77i32));
    // R1 maps to the forward strand, R2 to the reverse strand.
    mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, false, &mapped_attrs);

    // Remove AS, reverse n1/n2/s1, and revcomp s3 on negative-strand reads.
    let records = run_zipper(
        &unmapped,
        &mapped,
        vec!["AS".to_string()],
        vec!["n1".to_string(), "n2".to_string(), "s1".to_string()],
        vec!["s3".to_string()],
    )?;
    assert_eq!(records.len(), 2);
    for rec in &records {
        // AS was on the removal list.
        assert!(rec.data().get(&Tag::from(SamTag::AS)).is_none());
        if rec.flags().is_first_segment() {
            // Forward strand: values pass through untouched.
            assert_eq!(int16_tag(rec, Tag::new(b'n', b'1')), vec![1i16, 2, 3, 4, 5]);
            assert_eq!(string_tag(rec, Tag::new(b's', b'1')), "abcde");
            assert_eq!(string_tag(rec, Tag::new(b's', b'3')), "AGAGG");
        } else {
            // Reverse strand: reverse / revcomp applied as requested.
            assert_eq!(int16_tag(rec, Tag::new(b'n', b'1')), vec![5i16, 4, 3, 2, 1]);
            assert_eq!(string_tag(rec, Tag::new(b's', b'1')), "edcba");
            assert_eq!(string_tag(rec, Tag::new(b's', b'3')), "CCTCT");
        }
        // s2 was not listed for any manipulation, so it is copied verbatim.
        assert_eq!(string_tag(rec, Tag::new(b's', b'2')), "vwxyz");
    }
    Ok(())
}
#[test]
fn test_consensus_tag_set() -> Result<()> {
    // Fetch a tag as an Int16 array, failing the test (instead of silently
    // skipping the check, as the previous `if let` pattern did) when the tag
    // is absent or has another type.
    fn int16_tag(rec: &RecordBuf, tag: Tag) -> Vec<i16> {
        match rec.data().get(&tag) {
            Some(BufValue::Array(
                noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
            )) => vals.clone(),
            other => panic!("expected Int16 array for {tag:?}, got {other:?}"),
        }
    }
    // Fetch a tag as a String, failing the test when absent or mistyped.
    fn string_tag(rec: &RecordBuf, tag: Tag) -> String {
        match rec.data().get(&tag) {
            Some(BufValue::String(s)) => s.to_string(),
            other => panic!("expected String value for {tag:?}, got {other:?}"),
        }
    }

    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("aD", BufValue::from("AAAGG".to_string()));
    attrs.insert("bD", BufValue::from("AAAGC".to_string()));
    attrs.insert("ad", BufValue::from(vec![3i16, 3, 4, 4, 2]));
    unmapped.add_frag_with_attrs("q1", None, true, &attrs);
    let mut mapped_attrs1 = HashMap::new();
    mapped_attrs1.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
    mapped_attrs1.insert("AS", BufValue::from(77i32));
    mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs1.clone());
    // A reverse-strand supplementary carrying the same mapped-side tags.
    let mut supp_rec = RecordBuilder::new()
        .name("q1")
        .flags(Flags::SUPPLEMENTARY | Flags::REVERSE_COMPLEMENTED)
        .reference_sequence_id(0)
        .alignment_start(200)
        .cigar("100M")
        .sequence(&"A".repeat(100))
        .qualities(&[30u8; 100])
        .build();
    for (tag_str, value) in &mapped_attrs1 {
        let tag = Tag::new(tag_str.as_bytes()[0], tag_str.as_bytes()[1]);
        supp_rec.data_mut().insert(tag, value.clone());
    }
    mapped.push_record(supp_rec);

    // "Consensus" is the shorthand that expands to the consensus tag families.
    let records = run_zipper(
        &unmapped,
        &mapped,
        vec![],
        vec!["Consensus".to_string()],
        vec!["Consensus".to_string()],
    )?;
    assert_eq!(records.len(), 2);
    for rec in &records {
        if rec.flags().is_reverse_complemented() {
            // Negative strand: string consensus tags revcomped, array reversed.
            assert_eq!(string_tag(rec, Tag::new(b'a', b'D')), "CCTTT");
            assert_eq!(string_tag(rec, Tag::new(b'b', b'D')), "GCTTT");
            assert_eq!(int16_tag(rec, Tag::new(b'a', b'd')), vec![2i16, 4, 4, 3, 3]);
        } else {
            // Positive strand: values pass through untouched.
            assert_eq!(string_tag(rec, Tag::new(b'a', b'D')), "AAAGG");
            assert_eq!(string_tag(rec, Tag::new(b'b', b'D')), "AAAGC");
            assert_eq!(int16_tag(rec, Tag::new(b'a', b'd')), vec![3i16, 3, 4, 4, 2]);
        }
    }
    Ok(())
}
#[test]
fn test_unmapped_only_reads() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();

    // Three unmapped fragments; only q1 and q3 get a mapped counterpart.
    for (name, umi, xy) in
        [("q1", "ACGT", 1234i32), ("q2", "GATA", 3456i32), ("q3", "GGCG", 5678i32)]
    {
        let attrs = HashMap::from([
            ("RX", BufValue::from(umi.to_string())),
            ("xy", BufValue::from(xy)),
        ]);
        unmapped.add_frag_with_attrs(name, None, true, &attrs);
    }
    for (name, start, plus_strand) in [("q1", 100, true), ("q3", 200, false)] {
        let attrs = HashMap::from([
            ("PG", BufValue::from(MAPPED_PG_ID.to_string())),
            ("AS", BufValue::from(77i32)),
        ]);
        mapped.add_frag_with_attrs(name, Some(start), plus_strand, &attrs);
    }

    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 3);

    // Decode a record name as UTF-8 for comparisons below.
    let name_of = |r: &RecordBuf| -> String {
        String::from_utf8(r.name().expect("record should have a name").as_bytes().to_vec())
            .expect("record name should be valid UTF-8")
    };
    let names: Vec<String> = records.iter().map(|r| name_of(r)).collect();
    assert_eq!(names, vec!["q1", "q2", "q3"]);

    // q2 had no mapped counterpart, so it must come through as unmapped.
    let q2 = records
        .iter()
        .find(|&r| name_of(r) == "q2")
        .expect("expected record q2 not found");
    assert!(q2.flags().is_unmapped());
    Ok(())
}
#[test]
fn test_tag_info_expansion() {
    // The "Consensus" shorthand expands to the consensus tag families:
    // lowercase-second-letter tags land in the reverse set, uppercase-D tags
    // in the revcomp set.
    let tag_info =
        TagInfo::new(vec![], vec!["Consensus".to_string()], vec!["Consensus".to_string()]);
    for tag in ["ad", "ae", "bd", "be", "cd"] {
        assert!(tag_info.reverse.contains(tag), "reverse set should contain {tag}");
    }
    for tag in ["aD", "bD", "cD"] {
        assert!(tag_info.revcomp.contains(tag), "revcomp set should contain {tag}");
    }
}
#[test]
fn test_mate_info_fixing() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let attrs = HashMap::from([("RX", BufValue::from("ACGT".to_string()))]);
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    let mapped_attrs = HashMap::from([
        ("PG", BufValue::from(MAPPED_PG_ID.to_string())),
        ("AS", BufValue::from(77i32)),
    ]);
    mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs);

    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 2);

    // Both ends of the pair should have had mate info (MQ/MC) filled in.
    for (label, is_r1) in [("R1", true), ("R2", false)] {
        let rec = records
            .iter()
            .find(|r| r.flags().is_first_segment() == is_r1)
            .unwrap_or_else(|| panic!("expected {label} record not found"));
        assert!(rec.data().get(&Tag::from(SamTag::MQ)).is_some(), "{label} should have MQ tag");
        assert!(rec.data().get(&Tag::from(SamTag::MC)).is_some(), "{label} should have MC tag");
    }
    Ok(())
}
#[test]
fn test_qc_flag_transfer() -> Result<()> {
    // Verifies the QC pass/fail flag is carried from the unmapped reads to
    // their mapped counterparts on a per-read basis.
    // (The previous version built an `attrs1` map that was never used; the
    // records are constructed directly with RecordBuilder instead.)
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    // R1 passes QC; R2 carries the QC_FAIL flag in the unmapped input.
    let r1_unmapped = RecordBuilder::new()
        .name("q1")
        .flags(Flags::SEGMENTED | Flags::FIRST_SEGMENT | Flags::UNMAPPED)
        .tag("RX", "ACGT")
        .build();
    let r2_unmapped = RecordBuilder::new()
        .name("q1")
        .flags(Flags::SEGMENTED | Flags::LAST_SEGMENT | Flags::UNMAPPED | Flags::QC_FAIL)
        .tag("RX", "ACGT")
        .build();
    unmapped.push_record(r1_unmapped);
    unmapped.push_record(r2_unmapped);
    let mut mapped_attrs = HashMap::new();
    mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
    mapped_attrs.insert("AS", BufValue::from(77i32));
    mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs);

    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 2);
    let r1 = records
        .iter()
        .find(|r| r.flags().is_first_segment())
        .expect("expected R1 record not found");
    let r2 = records
        .iter()
        .find(|r| !r.flags().is_first_segment())
        .expect("expected R2 record not found");
    assert!(
        !r1.flags().is_qc_fail(),
        "R1 should not have QC_FAIL flag (passed QC in unmapped)"
    );
    assert!(r2.flags().is_qc_fail(), "R2 should have QC_FAIL flag (failed QC in unmapped)");
    Ok(())
}
#[test]
fn test_qc_flag_removal() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let attrs = HashMap::from([("RX", BufValue::from("ACGT".to_string()))]);
    unmapped.add_frag_with_attrs("q1", None, true, &attrs);
    // The mapped record arrives flagged QC_FAIL, but the unmapped read passed
    // QC, so the flag must be cleared during the merge.
    mapped.push_record(
        RecordBuilder::new()
            .name("q1")
            .flags(Flags::QC_FAIL)
            .reference_sequence_id(0)
            .alignment_start(100)
            .cigar("100M")
            .sequence(&"A".repeat(100))
            .qualities(&[30u8; 100])
            .build(),
    );
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 1);
    assert!(
        !records[0].flags().is_qc_fail(),
        "QC_FAIL flag should be removed when unmapped read passes QC"
    );
    Ok(())
}
#[test]
#[allow(clippy::too_many_lines)]
fn test_tag_operations_on_secondary_and_supplementary() -> Result<()> {
    // Fetch a tag as an Int16 array, failing the test (instead of silently
    // skipping the check, as the previous `if let` pattern did) when the tag
    // is absent or has another type.
    fn int16_tag(rec: &RecordBuf, tag: Tag) -> Vec<i16> {
        match rec.data().get(&tag) {
            Some(BufValue::Array(
                noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
            )) => vals.clone(),
            other => panic!("expected Int16 array for {tag:?}, got {other:?}"),
        }
    }
    // Fetch a tag as a String, failing the test when absent or mistyped.
    fn string_tag(rec: &RecordBuf, tag: Tag) -> String {
        match rec.data().get(&tag) {
            Some(BufValue::String(s)) => s.to_string(),
            other => panic!("expected String value for {tag:?}, got {other:?}"),
        }
    }

    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("n1", BufValue::from(vec![1i16, 2, 3, 4, 5]));
    attrs.insert("n2", BufValue::from(vec![2i16, 3, 4, 5, 6]));
    attrs.insert("s1", BufValue::from("abcde".to_string()));
    attrs.insert("s2", BufValue::from("vwxyz".to_string()));
    attrs.insert("s3", BufValue::from("AGAGG".to_string()));
    unmapped.add_frag_with_attrs("q1", None, true, &attrs);
    let mut mapped_attrs = HashMap::new();
    mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
    mapped_attrs.insert("AS", BufValue::from(77i32));
    mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs.clone());
    // Add a reverse-strand secondary and a reverse-strand supplementary, each
    // carrying the same mapped-side tags as the primary.
    for (flags, start) in [
        (Flags::SECONDARY | Flags::REVERSE_COMPLEMENTED, 200),
        (Flags::SUPPLEMENTARY | Flags::REVERSE_COMPLEMENTED, 300),
    ] {
        let mut rec = RecordBuilder::new()
            .name("q1")
            .flags(flags)
            .reference_sequence_id(0)
            .alignment_start(start)
            .cigar("100M")
            .sequence(&"A".repeat(100))
            .qualities(&[30u8; 100])
            .build();
        for (tag_str, value) in &mapped_attrs {
            let tag = Tag::new(tag_str.as_bytes()[0], tag_str.as_bytes()[1]);
            rec.data_mut().insert(tag, value.clone());
        }
        mapped.push_record(rec);
    }

    // Remove AS, reverse n1/n2/s1, revcomp s3 on negative-strand records.
    let records = run_zipper(
        &unmapped,
        &mapped,
        vec!["AS".to_string()],
        vec!["n1".to_string(), "n2".to_string(), "s1".to_string()],
        vec!["s3".to_string()],
    )?;
    assert_eq!(records.len(), 3, "Should have 3 records (primary, secondary, supplementary)");
    let secondary_and_supp_count = records
        .iter()
        .filter(|r| r.flags().is_secondary() || r.flags().is_supplementary())
        .count();
    assert_eq!(secondary_and_supp_count, 2, "Should have 2 secondary/supplementary records");
    let neg_strand_count = records.iter().filter(|r| r.flags().is_reverse_complemented()).count();
    assert_eq!(neg_strand_count, 2, "Should have 2 negative strand records");

    for rec in &records {
        assert!(rec.data().get(&Tag::from(SamTag::AS)).is_none(), "AS tag should be removed");
        if rec.flags().is_reverse_complemented() {
            assert_eq!(
                int16_tag(rec, Tag::new(b'n', b'1')),
                vec![5i16, 4, 3, 2, 1],
                "n1 should be reversed for negative strand"
            );
            assert_eq!(
                int16_tag(rec, Tag::new(b'n', b'2')),
                vec![6i16, 5, 4, 3, 2],
                "n2 should be reversed for negative strand"
            );
            assert_eq!(
                string_tag(rec, Tag::new(b's', b'1')),
                "edcba",
                "s1 should be reversed for negative strand"
            );
            assert_eq!(
                string_tag(rec, Tag::new(b's', b'3')),
                "CCTCT",
                "s3 should be revcomped for negative strand"
            );
        } else {
            assert_eq!(
                int16_tag(rec, Tag::new(b'n', b'1')),
                vec![1i16, 2, 3, 4, 5],
                "n1 should be unchanged for positive strand"
            );
            assert_eq!(
                int16_tag(rec, Tag::new(b'n', b'2')),
                vec![2i16, 3, 4, 5, 6],
                "n2 should be unchanged for positive strand"
            );
            assert_eq!(
                string_tag(rec, Tag::new(b's', b'1')),
                "abcde",
                "s1 should be unchanged for positive strand"
            );
            assert_eq!(
                string_tag(rec, Tag::new(b's', b'3')),
                "AGAGG",
                "s3 should be unchanged for positive strand"
            );
        }
        // s2 was not listed for any manipulation on either strand.
        assert_eq!(string_tag(rec, Tag::new(b's', b'2')), "vwxyz", "s2 should not be changed");
    }
    Ok(())
}
#[test]
fn test_multithreaded_processing() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    // Ten read pairs so there is enough work to spread across threads.
    for i in 0..10 {
        let name = format!("q{i}");
        let mut attrs = HashMap::new();
        attrs.insert("RX", BufValue::from(format!("ACG{i}")));
        attrs.insert("xy", BufValue::from(1000 + i as i32));
        unmapped.add_pair_with_attrs(&name, None, None, true, true, &attrs);
        let mut mapped_attrs = HashMap::new();
        mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
        mapped_attrs.insert("AS", BufValue::from(77i32));
        mapped.add_pair_with_attrs(
            &name,
            Some(100 + i * 100),
            Some(200 + i * 100),
            true,
            true,
            &mapped_attrs,
        );
    }
    let dir = TempDir::new()?;
    let unmapped_path = dir.path().join("unmapped.bam");
    let mapped_path = dir.path().join("mapped.sam");
    let output_path = dir.path().join("output.bam");
    let dict_path = create_ref_dict(&dir, "chr1", REFERENCE_LENGTH)?;
    unmapped.write(&unmapped_path)?;
    mapped.write_sam(&mapped_path)?;
    let zipper = Zipper {
        // The input paths are moved into the struct (no clones needed); only
        // `output_path` is read again below, so clone just that one.
        input: mapped_path,
        unmapped: unmapped_path,
        reference: dict_path,
        output: output_path.clone(),
        tags_to_remove: vec![],
        tags_to_reverse: vec![],
        tags_to_revcomp: vec![],
        buffer: 5000,
        threads: 4,
        compression: CompressionOptions { compression_level: 1 },
        bwa_chunk_size: 150_000_000,
        exclude_missing_reads: false,
        skip_pa_tags: false,
    };
    zipper.execute("test")?;
    let records = read_bam_records(&output_path)?;
    assert_eq!(records.len(), 20);
    for rec in &records {
        // Unmapped-side tags (RX/xy) and the mapped-side AS tag all survive.
        assert!(rec.data().get(&Tag::from(SamTag::RX)).is_some());
        assert!(rec.data().get(&Tag::new(b'x', b'y')).is_some());
        assert!(rec.data().get(&Tag::from(SamTag::AS)).is_some());
    }
    Ok(())
}
#[test]
fn test_empty_unmapped() -> Result<()> {
    // An empty unmapped input cannot account for the mapped read, so zipping
    // must fail with an error about leftover mapped reads.
    let unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let attrs = HashMap::from([("PG", BufValue::from(MAPPED_PG_ID.to_string()))]);
    mapped.add_frag_with_attrs("q1", Some(100), true, &attrs);
    let result = run_zipper(&unmapped, &mapped, vec![], vec![], vec![]);
    assert!(result.is_err());
    let err_msg = result.unwrap_err().to_string();
    assert!(
        err_msg.contains("mapped reads remaining"),
        "Expected error about leftover mapped reads, got: {err_msg}"
    );
    Ok(())
}
#[test]
fn test_copy_tags_with_pg_present() -> Result<()> {
    // The source record carries an old PG plus a custom tag; the destination
    // already has its own PG value, which must win over the source's.
    let src =
        RecordBuilder::new().sequence("ACGT").tag("PG", "old_pg").tag("XY", 123i32).build();
    let mut dest_data = Data::default();
    dest_data.insert(Tag::from(SamTag::PG), BufValue::from("new_pg".to_string()));
    let tag_info = TagInfo::new(vec![], vec![], vec![]);
    copy_tags(&src, &mut dest_data, &tag_info)?;
    // Fail loudly if the PG tag is missing or mistyped instead of silently
    // skipping the check (the previous `if let` pattern was vacuous on a miss).
    match dest_data.get(&Tag::from(SamTag::PG)) {
        Some(BufValue::String(s)) => assert_eq!(s.to_string(), "new_pg"),
        other => panic!("expected String PG tag, got {other:?}"),
    }
    assert!(dest_data.get(&Tag::new(b'X', b'Y')).is_some());
    Ok(())
}
#[test]
fn test_copy_tags_with_removal() -> Result<()> {
    // XY is on the removal list; AB is not and must be copied through.
    let src = RecordBuilder::new().sequence("ACGT").tag("XY", 123i32).tag("AB", 456i32).build();
    let mut dest = Data::default();
    let tag_info = TagInfo::new(vec!["XY".to_string()], vec![], vec![]);
    copy_tags(&src, &mut dest, &tag_info)?;
    assert!(dest.get(&Tag::new(b'X', b'Y')).is_none());
    assert!(dest.get(&Tag::new(b'A', b'B')).is_some());
    Ok(())
}
#[test]
fn test_merge_mixed_strands() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let attrs = HashMap::from([
        ("RX", BufValue::from("ACGT".to_string())),
        ("xy", BufValue::from(1234i32)),
    ]);
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    let mapped_attrs = HashMap::from([("PG", BufValue::from(MAPPED_PG_ID.to_string()))]);
    mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs);
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 2);
    // Both ends of the pair should carry the tags transferred from unmapped.
    for rec in &records {
        assert!(rec.data().get(&Tag::from(SamTag::RX)).is_some());
        assert!(rec.data().get(&Tag::new(b'x', b'y')).is_some());
    }
    Ok(())
}
#[test]
fn test_tag_info_has_revs_or_revcomps() {
    // Only an entirely-empty reverse + revcomp configuration reports false.
    let cases = [
        (vec![], vec![], false),
        (vec!["n1".to_string()], vec![], true),
        (vec![], vec!["s1".to_string()], true),
        (vec!["n1".to_string()], vec!["s1".to_string()], true),
    ];
    for (reverse, revcomp, expected) in cases {
        let tag_info = TagInfo::new(vec![], reverse, revcomp);
        assert_eq!(tag_info.has_revs_or_revcomps(), expected);
    }
}
#[test]
fn test_multiple_secondary_alignments() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let attrs = HashMap::from([("RX", BufValue::from("ACGT".to_string()))]);
    unmapped.add_frag_with_attrs("q1", None, true, &attrs);
    let mapped_attrs = HashMap::from([("PG", BufValue::from(MAPPED_PG_ID.to_string()))]);
    mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs.clone());
    // Pile three secondary alignments on top of the primary, each carrying the
    // same mapped-side tags.
    for i in 0..3 {
        let mut rec = RecordBuilder::new()
            .name("q1")
            .flags(Flags::SECONDARY)
            .reference_sequence_id(0)
            .alignment_start(200 + i * 100)
            .cigar("100M")
            .sequence(&"A".repeat(100))
            .qualities(&[30u8; 100])
            .build();
        for (tag_str, value) in &mapped_attrs {
            let tag = Tag::new(tag_str.as_bytes()[0], tag_str.as_bytes()[1]);
            rec.data_mut().insert(tag, value.clone());
        }
        mapped.push_record(rec);
    }
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 4, "Should have 1 primary + 3 secondary alignments");
    // The RX tag must be propagated to every alignment, secondaries included.
    assert!(records.iter().all(|r| r.data().get(&Tag::from(SamTag::RX)).is_some()));
    Ok(())
}
#[test]
fn test_r2_only_mapping() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let attrs = HashMap::from([("RX", BufValue::from("ACGT".to_string()))]);
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    // Only R2 of the pair made it through alignment.
    mapped.push_record(
        RecordBuilder::new()
            .name("q1")
            .flags(Flags::SEGMENTED | Flags::LAST_SEGMENT)
            .reference_sequence_id(0)
            .alignment_start(200)
            .cigar("100M")
            .sequence(&"A".repeat(100))
            .qualities(&[30u8; 100])
            .tag("PG", MAPPED_PG_ID)
            .build(),
    );
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 1, "Should have only R2 in output");
    let out = &records[0];
    assert!(!out.flags().is_first_segment());
    assert!(out.data().get(&Tag::from(SamTag::RX)).is_some());
    Ok(())
}
/// Materialize the in-memory builders as files in a temp dir, run the zipper
/// over them with the given `exclude_missing_reads` setting, and read the
/// resulting BAM back into memory.
fn run_zipper_with_options(
    unmapped: &SamBuilder,
    mapped: &SamBuilder,
    exclude_missing_reads: bool,
) -> Result<Vec<RecordBuf>> {
    let dir = TempDir::new()?;
    let unmapped_path = dir.path().join("unmapped.bam");
    let mapped_path = dir.path().join("mapped.sam");
    let output_path = dir.path().join("output.bam");
    let dict_path = create_ref_dict(&dir, "chr1", REFERENCE_LENGTH)?;
    unmapped.write(&unmapped_path)?;
    mapped.write_sam(&mapped_path)?;
    let zipper = Zipper {
        // The input paths are moved into the struct (no clones needed); only
        // `output_path` is read again below, so clone just that one.
        input: mapped_path,
        unmapped: unmapped_path,
        reference: dict_path,
        output: output_path.clone(),
        tags_to_remove: vec![],
        tags_to_reverse: vec![],
        tags_to_revcomp: vec![],
        buffer: 5000,
        threads: 1,
        compression: CompressionOptions { compression_level: 1 },
        bwa_chunk_size: 150_000_000,
        exclude_missing_reads,
        skip_pa_tags: false,
    };
    zipper.execute("test")?;
    read_bam_records(&output_path)
}
#[test]
fn test_both_inputs_empty() -> Result<()> {
    // Zipping two empty inputs is valid and yields an empty output.
    let unmapped = SamBuilder::new_unmapped();
    let mapped = SamBuilder::new_mapped();
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert!(records.is_empty());
    Ok(())
}
#[test]
fn test_exclude_missing_reads() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let attrs = HashMap::from([("RX", BufValue::from("ACGT".to_string()))]);
    // Three unmapped pairs, but q2 never receives a mapped counterpart.
    for name in ["q1", "q2", "q3"] {
        unmapped.add_pair_with_attrs(name, None, None, true, true, &attrs);
    }
    let mapped_attrs = HashMap::from([("PG", BufValue::from(MAPPED_PG_ID.to_string()))]);
    mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs);
    mapped.add_pair_with_attrs("q3", Some(300), Some(400), true, true, &mapped_attrs);

    // Default behaviour: the missing pair is still emitted.
    let with_missing = run_zipper_with_options(&unmapped, &mapped, false)?;
    assert_eq!(with_missing.len(), 6, "Should have 6 records (3 pairs)");

    // With exclusion enabled, q2 is dropped entirely.
    let without_missing = run_zipper_with_options(&unmapped, &mapped, true)?;
    assert_eq!(without_missing.len(), 4, "Should have 4 records (2 pairs)");
    for rec in &without_missing {
        if let Some(name) = rec.name() {
            assert_ne!(String::from_utf8_lossy(name.as_ref()), "q2", "q2 should be excluded");
        }
    }
    Ok(())
}
#[test]
fn test_file_not_found_error() -> Result<()> {
    // Point the zipper at a reference path that was never created and verify
    // execution fails with a file-not-found style error.
    let dir = TempDir::new()?;
    let unmapped_path = dir.path().join("nonexistent_unmapped.bam");
    let mapped_path = dir.path().join("mapped.sam");
    let output_path = dir.path().join("output.bam");
    let mapped = SamBuilder::new_mapped();
    mapped.write_sam(&mapped_path)?;
    let zipper = Zipper {
        input: mapped_path,
        // `unmapped_path` is not used again, so move it rather than clone.
        unmapped: unmapped_path,
        reference: dir.path().join("ref.fa"),
        output: output_path,
        tags_to_remove: vec![],
        tags_to_reverse: vec![],
        tags_to_revcomp: vec![],
        buffer: 5000,
        threads: 1,
        compression: CompressionOptions { compression_level: 1 },
        bwa_chunk_size: 150_000_000,
        exclude_missing_reads: false,
        skip_pa_tags: false,
    };
    let result = zipper.execute("test");
    assert!(result.is_err(), "Should error when reference file doesn't exist");
    let err_msg = result.unwrap_err().to_string();
    assert!(
        err_msg.contains("does not exist") || err_msg.contains("not found"),
        "Error should mention file not found: {err_msg}"
    );
    Ok(())
}
// Raw SAM FLAG bit values (SAM spec, section 1.4), used below to compose
// record flags for the primary-alignment ("pa") tag tests.
const FLAG_PAIRED: u16 = 0x1; // template has multiple segments
const FLAG_READ1: u16 = 0x40; // first segment in the template (R1)
const FLAG_READ2: u16 = 0x80; // last segment in the template (R2)
const FLAG_SECONDARY: u16 = 0x100; // secondary alignment
const FLAG_SUPPLEMENTARY: u16 = 0x800; // supplementary alignment
const FLAG_REVERSE: u16 = 0x10; // segment mapped to the reverse strand
#[test]
fn test_add_pa_tag_to_supplementary_r1() -> Result<()> {
    use crate::template::Template;
    // Helper: an 8M "q1" record with the given flags at tid/start.
    let mk = |flags: u16, tid, start| {
        RecordBuilder::new()
            .name("q1")
            .sequence("ACGTACGT")
            .qualities(&[30; 8])
            .reference_sequence_id(tid)
            .alignment_start(start)
            .cigar("8M")
            .flags(Flags::from(flags))
            .build()
    };
    // Primary R1 at tid 0 / pos 100, supplementary R1 on another contig.
    let mut template = Template::from_records(vec![
        mk(FLAG_PAIRED | FLAG_READ1, 0, 100),
        mk(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY, 5, 5000),
    ])?;
    add_primary_alignment_tags(&mut template);
    let supp = template
        .records
        .iter()
        .find(|r| r.flags().is_supplementary())
        .expect("Should have supplementary");
    let pa_value = supp.data().get(&PA_TAG).expect("Supplementary should have pa tag");
    let pa_info =
        PrimaryAlignmentInfo::from_tag_value(pa_value).expect("Should be able to parse pa tag");
    // The pa tag records the primary alignment of R1...
    assert_eq!(pa_info.tid1, 0, "tid1 should be R1's reference");
    assert_eq!(pa_info.pos1, 100, "pos1 should be R1's position");
    assert!(!pa_info.neg1, "neg1 should be false (forward strand)");
    // ...and with no mate, the second slot mirrors the first.
    assert_eq!(pa_info.tid2, 0, "tid2 should equal tid1 (single read)");
    assert_eq!(pa_info.pos2, 100, "pos2 should equal pos1 (single read)");
    assert!(!pa_info.neg2, "neg2 should equal neg1 (single read)");
    let primary = template.r1().expect("Should have primary R1");
    assert!(primary.data().get(&PA_TAG).is_none(), "Primary should not have pa tag");
    Ok(())
}
#[test]
fn test_add_pa_tag_to_secondary_reverse_strand() -> Result<()> {
    use crate::template::Template;
    // Helper: an 8M "q1" record with the given flags at tid/start.
    let mk = |flags: u16, tid, start| {
        RecordBuilder::new()
            .name("q1")
            .sequence("ACGTACGT")
            .qualities(&[30; 8])
            .reference_sequence_id(tid)
            .alignment_start(start)
            .cigar("8M")
            .flags(Flags::from(flags))
            .build()
    };
    // Primary R1 on the reverse strand; secondary alignment on another contig.
    let mut template = Template::from_records(vec![
        mk(FLAG_PAIRED | FLAG_READ1 | FLAG_REVERSE, 0, 200),
        mk(FLAG_PAIRED | FLAG_READ1 | FLAG_SECONDARY, 3, 3000),
    ])?;
    add_primary_alignment_tags(&mut template);
    let secondary = template
        .records
        .iter()
        .find(|r| r.flags().is_secondary())
        .expect("Should have secondary");
    let pa_value = secondary.data().get(&PA_TAG).expect("Secondary should have pa tag");
    let pa_info =
        PrimaryAlignmentInfo::from_tag_value(pa_value).expect("Should be able to parse pa tag");
    assert_eq!(pa_info.tid1, 0, "tid1 should be R1's reference");
    // On the reverse strand the 5' end is the alignment end: 200 + 8 - 1.
    assert_eq!(pa_info.pos1, 207, "pos1 should be R1's unclipped 5' position");
    assert!(pa_info.neg1, "neg1 should be true (reverse strand)");
    Ok(())
}
#[test]
fn test_add_pa_tag_paired_end_r2_supplementary() -> Result<()> {
    use crate::template::Template;
    // Helper: a "q1" record with the given flags, position, sequence and cigar.
    let mk = |flags: u16, tid, start, seq: &str, cigar: &str| {
        RecordBuilder::new()
            .name("q1")
            .sequence(seq)
            .qualities(&vec![30u8; seq.len()])
            .reference_sequence_id(tid)
            .alignment_start(start)
            .cigar(cigar)
            .flags(Flags::from(flags))
            .build()
    };
    // Primary R1 forward at 100, primary R2 reverse at 300, and a
    // supplementary R2 on another contig.
    let mut template = Template::from_records(vec![
        mk(FLAG_PAIRED | FLAG_READ1, 0, 100, "ACGTACGT", "8M"),
        mk(FLAG_PAIRED | FLAG_READ2 | FLAG_REVERSE, 0, 300, "ACGTACGT", "8M"),
        mk(FLAG_PAIRED | FLAG_READ2 | FLAG_SUPPLEMENTARY, 5, 5000, "ACGT", "4M"),
    ])?;
    add_primary_alignment_tags(&mut template);
    let supp = template
        .records
        .iter()
        .find(|r| r.flags().is_supplementary())
        .expect("Should have supplementary");
    let pa_value = supp.data().get(&PA_TAG).expect("Supplementary R2 should have pa tag");
    let pa_info =
        PrimaryAlignmentInfo::from_tag_value(pa_value).expect("Should be able to parse pa tag");
    // Slot 1 holds the earlier primary (R1), slot 2 the later one (R2).
    assert_eq!(pa_info.tid1, 0, "tid1 should be R1's reference (earlier)");
    assert_eq!(pa_info.pos1, 100, "pos1 should be R1's unclipped 5' position (earlier)");
    assert!(!pa_info.neg1, "neg1 should be false (R1 forward strand)");
    assert_eq!(pa_info.tid2, 0, "tid2 should be R2's reference (later)");
    assert_eq!(pa_info.pos2, 307, "pos2 should be R2's unclipped 5' position (later)");
    assert!(pa_info.neg2, "neg2 should be true (R2 reverse strand)");
    Ok(())
}
#[test]
fn test_no_pa_tag_when_no_primary() -> Result<()> {
    use crate::template::Template;
    // A lone supplementary record with no primary alignment to point back to.
    let lone_supp = RecordBuilder::new()
        .name("q1")
        .sequence("ACGT")
        .qualities(&[30; 4])
        .reference_sequence_id(5)
        .alignment_start(5000)
        .cigar("4M")
        .flags(Flags::from(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY))
        .build();
    let mut template = Template::from_records(vec![lone_supp])?;
    add_primary_alignment_tags(&mut template);
    // With no primary available there is nothing to record, so no pa tag.
    let supp = template
        .records
        .iter()
        .find(|r| r.flags().is_supplementary())
        .expect("Should have supplementary");
    assert!(
        supp.data().get(&PA_TAG).is_none(),
        "Supplementary without primary should not have pa tag"
    );
    Ok(())
}
#[test]
fn test_pa_tag_added_during_merge() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let attrs = HashMap::from([("RX", BufValue::from("ACGT".to_string()))]);
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    // A normal mapped pair plus a supplementary R1 alignment far away.
    let _ = mapped.add_pair().name("q1").start1(100).start2(200).build();
    mapped.push_record(
        RecordBuilder::new()
            .name("q1")
            .sequence("ACGTACGTACGT")
            .qualities(&[30; 12])
            .reference_sequence_id(0)
            .alignment_start(5000)
            .cigar("12M")
            .flags(Flags::from(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY))
            .build(),
    );
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    let supp_record = records
        .iter()
        .find(|r| r.flags().is_supplementary())
        .expect("Should have supplementary in output");
    let pa_value =
        supp_record.data().get(&PA_TAG).expect("Supplementary should have pa tag after merge");
    let pa_info =
        PrimaryAlignmentInfo::from_tag_value(pa_value).expect("Should be able to parse pa tag");
    assert_eq!(pa_info.tid1, 0, "tid1 should be 0");
    assert_eq!(pa_info.pos1, 100, "pos1 should be R1's unclipped 5' position");
    assert_eq!(pa_info.tid2, 0, "tid2 should be 0");
    assert_eq!(pa_info.pos2, 299, "pos2 should be R2's unclipped 5' position (reverse strand)");
    // Normal tag transfer still happens for the supplementary record.
    assert!(
        supp_record.data().get(&Tag::from(SamTag::RX)).is_some(),
        "Supplementary should also have RX tag copied from unmapped"
    );
    Ok(())
}
#[test]
fn test_add_pa_tag_early_exit_no_secondary_supplementary() -> Result<()> {
    use crate::template::Template;
    // Helper: a 4M "q1" record at tid 0 with the given flags, seq and start.
    let mk = |flags: u16, seq: &str, start| {
        RecordBuilder::new()
            .name("q1")
            .sequence(seq)
            .qualities(&[30, 30, 30, 30])
            .reference_sequence_id(0)
            .alignment_start(start)
            .cigar("4M")
            .flags(Flags::from(flags))
            .build()
    };
    // A plain primary pair: no secondary/supplementary records, so the
    // pa-tagging pass has nothing to annotate.
    let mut template = Template::from_records(vec![
        mk(FLAG_PAIRED | FLAG_READ1, "ACGT", 100),
        mk(FLAG_PAIRED | FLAG_READ2 | FLAG_REVERSE, "TGCA", 200),
    ])?;
    add_primary_alignment_tags(&mut template);
    assert!(
        template.records.iter().all(|r| r.data().get(&PA_TAG).is_none()),
        "Primary reads should not have pa tag"
    );
    Ok(())
}
#[test]
fn test_add_pa_tag_only_to_secondary_supplementary() -> Result<()> {
    use crate::template::Template;
    // Helper: a 4M "q1" ACGT record with the given flags at tid/start.
    let mk = |flags: u16, tid, start| {
        RecordBuilder::new()
            .name("q1")
            .sequence("ACGT")
            .qualities(&[30, 30, 30, 30])
            .reference_sequence_id(tid)
            .alignment_start(start)
            .cigar("4M")
            .flags(Flags::from(flags))
            .build()
    };
    let mut template = Template::from_records(vec![
        mk(FLAG_PAIRED | FLAG_READ1, 0, 100),
        mk(FLAG_PAIRED | FLAG_READ1 | FLAG_SECONDARY, 5, 5000),
    ])?;
    add_primary_alignment_tags(&mut template);
    // Only the secondary record receives the pa tag; the primary stays clean.
    for record in &template.records {
        let has_pa = record.data().get(&PA_TAG).is_some();
        if record.flags().is_secondary() {
            assert!(has_pa, "Secondary should have pa tag");
        } else {
            assert!(!has_pa, "Primary should not have pa tag");
        }
    }
    Ok(())
}
#[test]
fn test_skip_pa_tags_parameter() -> Result<()> {
    use crate::template::Template;
    const FLAG_UNMAPPED: u16 = 0x4;
    // Helper: a partially-configured "q1" ACGT builder with the given flags;
    // callers finish the chain (alignment fields, tags) before `.build()`.
    let base = |flags: u16| {
        RecordBuilder::new()
            .name("q1")
            .sequence("ACGT")
            .qualities(&[30, 30, 30, 30])
            .flags(Flags::from(flags))
    };
    // Mapped template: primary R1 plus a supplementary on another contig.
    let primary = base(FLAG_PAIRED | FLAG_READ1)
        .reference_sequence_id(0)
        .alignment_start(100)
        .cigar("4M")
        .build();
    let supplementary = base(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY)
        .reference_sequence_id(5)
        .alignment_start(5000)
        .cigar("4M")
        .build();
    let mut template = Template::from_records(vec![primary, supplementary])?;
    // Matching unmapped read carrying an RX tag.
    let unmapped_record =
        base(FLAG_PAIRED | FLAG_READ1 | FLAG_UNMAPPED).tag("RX", "AAAA").build();
    let unmapped = Template::from_records(vec![unmapped_record])?;
    let tag_info = TagInfo::new(vec![], vec![], vec![]);
    // With skip_pa_tags = true the merge must not add any pa tags.
    merge(&unmapped, &mut template, &tag_info, true)?;
    for record in &template.records {
        assert!(
            record.data().get(&PA_TAG).is_none(),
            "No records should have pa tag when skip_pa_tags=true"
        );
    }
    Ok(())
}
#[test]
fn test_skip_pa_tags_false_adds_tags() -> Result<()> {
    use crate::template::Template;
    const FLAG_UNMAPPED: u16 = 0x4;
    // Mapped template: a primary R1 plus a supplementary R1 alignment.
    let primary = RecordBuilder::new()
        .name("q1")
        .sequence("ACGT")
        .qualities(&[30, 30, 30, 30])
        .reference_sequence_id(0)
        .alignment_start(100)
        .cigar("4M")
        .flags(Flags::from(FLAG_PAIRED | FLAG_READ1))
        .build();
    let supplementary = RecordBuilder::new()
        .name("q1")
        .sequence("ACGT")
        .qualities(&[30, 30, 30, 30])
        .reference_sequence_id(5)
        .alignment_start(5000)
        .cigar("4M")
        .flags(Flags::from(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY))
        .build();
    let mut mapped_template = Template::from_records(vec![primary, supplementary])?;
    // Corresponding unmapped read carrying an RX tag to transfer.
    let unmapped_rec = RecordBuilder::new()
        .name("q1")
        .sequence("ACGT")
        .qualities(&[30, 30, 30, 30])
        .flags(Flags::from(FLAG_PAIRED | FLAG_READ1 | FLAG_UNMAPPED))
        .tag("RX", "AAAA")
        .build();
    let unmapped = Template::from_records(vec![unmapped_rec])?;
    let tag_info = TagInfo::new(vec![], vec![], vec![]);
    // skip_pa_tags=false: the supplementary record should gain a pa tag.
    merge(&unmapped, &mut mapped_template, &tag_info, false)?;
    let supp_with_pa = mapped_template
        .records
        .iter()
        .find(|rec| rec.flags().is_supplementary() && rec.data().get(&PA_TAG).is_some());
    assert!(supp_with_pa.is_some(), "Supplementary should have pa tag when skip_pa_tags=false");
    Ok(())
}
#[test]
fn test_as_xs_normalization() -> Result<()> {
    // AS/XS values that fit in one byte should be normalized down to Int8.
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("RX", BufValue::from("ACGT".to_string()));
    unmapped.add_frag_with_attrs("q1", None, true, &attrs);
    mapped.push_record(
        RecordBuilder::new()
            .name("q1")
            .flags(Flags::empty())
            .reference_sequence_id(0)
            .alignment_start(100)
            .cigar("100M")
            .sequence(&"A".repeat(100))
            .qualities(&[30u8; 100])
            .tag("PG", MAPPED_PG_ID.to_string())
            .tag("AS", 77i32)
            .tag("XS", 50i32)
            .build(),
    );
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 1);
    let rec = &records[0];
    let as_val = rec.data().get(&Tag::from(SamTag::AS)).unwrap();
    assert!(matches!(as_val, BufValue::Int8(77)), "AS=77 should be Int8, got {as_val:?}");
    let xs_val = rec.data().get(&Tag::from(SamTag::XS)).unwrap();
    assert!(matches!(xs_val, BufValue::Int8(50)), "XS=50 should be Int8, got {xs_val:?}");
    Ok(())
}
#[test]
fn test_as_xs_normalization_large_values() -> Result<()> {
    // AS/XS values outside the Int8 range should normalize to Int16, not Int8.
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("RX", BufValue::from("ACGT".to_string()));
    unmapped.add_frag_with_attrs("q1", None, true, &attrs);
    mapped.push_record(
        RecordBuilder::new()
            .name("q1")
            .flags(Flags::empty())
            .reference_sequence_id(0)
            .alignment_start(100)
            .cigar("100M")
            .sequence(&"A".repeat(100))
            .qualities(&[30u8; 100])
            .tag("PG", MAPPED_PG_ID.to_string())
            .tag("AS", 200i32)
            .tag("XS", -200i32)
            .build(),
    );
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 1);
    let rec = &records[0];
    let as_val = rec.data().get(&Tag::from(SamTag::AS)).unwrap();
    assert!(matches!(as_val, BufValue::Int16(200)), "AS=200 should be Int16, got {as_val:?}");
    let xs_val = rec.data().get(&Tag::from(SamTag::XS)).unwrap();
    assert!(matches!(xs_val, BufValue::Int16(-200)), "XS=-200 should be Int16, got {xs_val:?}");
    Ok(())
}
#[test]
fn test_negative_strand_r1_tag_copying() -> Result<()> {
    // R1 maps to the negative strand, R2 to the positive strand. Tags listed
    // as "reverse" (n1) must be reversed on R1 only; tags listed as
    // "revcomp" (s1) must be reverse-complemented on R1 only.
    //
    // NOTE: the previous version used "ACGT" for s1, which is its own
    // reverse-complement, so the R1 revcomp assertion was vacuous (it passed
    // even when no transformation — or a plain reversal — was applied).
    // "AACG" is used instead: revcomp("AACG") == "CGTT", which differs from
    // both the original value and its plain reversal ("GCAA").
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("n1", BufValue::from(vec![1i16, 2, 3]));
    attrs.insert("s1", BufValue::from("AACG".to_string()));
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    let mut mapped_attrs = HashMap::new();
    mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
    // R1 at 100 (negative strand), R2 at 200 (positive strand).
    mapped.add_pair_with_attrs("q1", Some(100), Some(200), false, true, &mapped_attrs);
    let records =
        run_zipper(&unmapped, &mapped, vec![], vec!["n1".to_string()], vec!["s1".to_string()])?;
    assert_eq!(records.len(), 2);
    let r1 = records.iter().find(|r| r.flags().is_first_segment()).unwrap();
    let r2 = records.iter().find(|r| !r.flags().is_first_segment()).unwrap();
    // R1 (negative strand): array tag reversed, string tag reverse-complemented.
    if let Some(BufValue::Array(
        noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
    )) = r1.data().get(&Tag::new(b'n', b'1'))
    {
        assert_eq!(*vals, vec![3i16, 2, 1], "n1 should be reversed for R1 (negative strand)");
    } else {
        panic!("n1 tag not found or wrong type on R1");
    }
    if let Some(BufValue::String(s)) = r1.data().get(&Tag::new(b's', b'1')) {
        assert_eq!(s.to_string(), "CGTT", "s1 should be revcomped for R1 (negative strand)");
    } else {
        panic!("s1 tag not found or wrong type on R1");
    }
    // R2 (positive strand): both tags pass through untouched.
    if let Some(BufValue::Array(
        noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
    )) = r2.data().get(&Tag::new(b'n', b'1'))
    {
        assert_eq!(*vals, vec![1i16, 2, 3], "n1 should be unchanged for R2 (positive strand)");
    } else {
        panic!("n1 tag not found or wrong type on R2");
    }
    if let Some(BufValue::String(s)) = r2.data().get(&Tag::new(b's', b'1')) {
        assert_eq!(s.to_string(), "AACG", "s1 should be unchanged for R2 (positive strand)");
    } else {
        panic!("s1 tag not found or wrong type on R2");
    }
    Ok(())
}
#[test]
fn test_negative_strand_both_reads() -> Result<()> {
    // Both mates map to the negative strand, so the reverse tag (n1) and the
    // revcomp tag (s1) must be transformed on both records.
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("n1", BufValue::from(vec![10i16, 20, 30]));
    attrs.insert("s1", BufValue::from("AACC".to_string()));
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    let mut mapped_attrs = HashMap::new();
    mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
    mapped.add_pair_with_attrs("q1", Some(200), Some(100), false, false, &mapped_attrs);
    let records =
        run_zipper(&unmapped, &mapped, vec![], vec!["n1".to_string()], vec!["s1".to_string()])?;
    assert_eq!(records.len(), 2);
    for record in &records {
        match record.data().get(&Tag::new(b'n', b'1')) {
            Some(BufValue::Array(
                noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
            )) => {
                assert_eq!(*vals, vec![30i16, 20, 10], "n1 should be reversed on negative strand");
            }
            _ => panic!("n1 tag not found or wrong type"),
        }
        match record.data().get(&Tag::new(b's', b'1')) {
            Some(BufValue::String(s)) => {
                assert_eq!(s.to_string(), "GGTT", "s1 should be revcomped on negative strand");
            }
            _ => panic!("s1 tag not found or wrong type"),
        }
    }
    Ok(())
}
#[test]
// Verifies that zipping recomputes mate-pair metadata on a fully mapped pair:
// RNEXT/PNEXT point at the mate, TLENs are negations of each other, and the
// MQ (mate mapping quality) / MC (mate CIGAR) tags are cross-filled.
fn test_fix_mate_info_values() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("RX", BufValue::from("ACGT".to_string()));
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    // R1: forward strand at pos 100, 50M, MAPQ 40; mate flagged as reversed.
    let r1_mapped = RecordBuilder::new()
        .name("q1")
        .flags(Flags::SEGMENTED | Flags::FIRST_SEGMENT | Flags::MATE_REVERSE_COMPLEMENTED)
        .reference_sequence_id(0)
        .alignment_start(100)
        .cigar("50M")
        .mapping_quality(40)
        .sequence(&"A".repeat(50))
        .qualities(&[30u8; 50])
        .tag("PG", MAPPED_PG_ID.to_string())
        .build();
    // R2: reverse strand at pos 300, 75M, MAPQ 30.
    let r2_mapped = RecordBuilder::new()
        .name("q1")
        .flags(
            Flags::SEGMENTED
                | Flags::LAST_SEGMENT
                | Flags::REVERSE_COMPLEMENTED
                | Flags::PROPERLY_SEGMENTED,
        )
        .reference_sequence_id(0)
        .alignment_start(300)
        .cigar("75M")
        .mapping_quality(30)
        .sequence(&"A".repeat(75))
        .qualities(&[30u8; 75])
        .tag("PG", MAPPED_PG_ID.to_string())
        .build();
    mapped.push_record(r1_mapped);
    mapped.push_record(r2_mapped);
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 2);
    let r1 = records.iter().find(|r| r.flags().is_first_segment()).unwrap();
    let r2 = records.iter().find(|r| !r.flags().is_first_segment()).unwrap();
    // Each read's RNEXT/PNEXT must resolve to its mate's coordinates.
    assert_eq!(
        r1.mate_reference_sequence_id(),
        r2.reference_sequence_id(),
        "R1's RNEXT should equal R2's reference"
    );
    assert_eq!(
        r1.mate_alignment_start(),
        r2.alignment_start(),
        "R1's PNEXT should equal R2's position"
    );
    assert_eq!(
        r2.mate_reference_sequence_id(),
        r1.reference_sequence_id(),
        "R2's RNEXT should equal R1's reference"
    );
    assert_eq!(
        r2.mate_alignment_start(),
        r1.alignment_start(),
        "R2's PNEXT should equal R1's position"
    );
    // TLEN: non-zero for a mapped pair, with opposite signs on the two mates.
    let r1_tlen = r1.template_length();
    let r2_tlen = r2.template_length();
    assert!(r1_tlen != 0, "R1 TLEN should be non-zero for mapped pair");
    assert_eq!(r1_tlen, -r2_tlen, "R1 TLEN should be negation of R2 TLEN");
    // MQ carries the mate's MAPQ (stored normalized to Int8).
    let r1_mq = r1.data().get(&Tag::from(SamTag::MQ));
    let r2_mq = r2.data().get(&Tag::from(SamTag::MQ));
    assert!(
        matches!(r1_mq, Some(BufValue::Int8(30))),
        "R1's MQ should be R2's mapq (30), got {r1_mq:?}"
    );
    assert!(
        matches!(r2_mq, Some(BufValue::Int8(40))),
        "R2's MQ should be R1's mapq (40), got {r2_mq:?}"
    );
    // MC carries the mate's CIGAR string.
    if let Some(BufValue::String(mc)) = r1.data().get(&Tag::from(SamTag::MC)) {
        assert_eq!(mc.to_string(), "75M", "R1's MC should be R2's CIGAR");
    } else {
        panic!("R1 should have MC tag");
    }
    if let Some(BufValue::String(mc)) = r2.data().get(&Tag::from(SamTag::MC)) {
        assert_eq!(mc.to_string(), "50M", "R2's MC should be R1's CIGAR");
    } else {
        panic!("R2 should have MC tag");
    }
    Ok(())
}
#[test]
// Verifies mate-info fixing on an R1 supplementary alignment: its
// RNEXT/PNEXT/MQ/MC must point at the *primary* R2 (not at any other
// alignment), its ms tag must carry R2 primary's AS score, and its TLEN must
// match the primary R1's TLEN.
fn test_fix_mate_info_supplementary() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("RX", BufValue::from("ACGT".to_string()));
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    // Primary R1: pos 100, 50M, MAPQ 40, AS=77.
    let r1_mapped = RecordBuilder::new()
        .name("q1")
        .flags(Flags::SEGMENTED | Flags::FIRST_SEGMENT | Flags::MATE_REVERSE_COMPLEMENTED)
        .reference_sequence_id(0)
        .alignment_start(100)
        .cigar("50M")
        .mapping_quality(40)
        .sequence(&"A".repeat(50))
        .qualities(&[30u8; 50])
        .tag("PG", MAPPED_PG_ID.to_string())
        .tag("AS", 77i32)
        .build();
    // Primary R2: pos 300, 75M, MAPQ 30, AS=55 — the expected mate target.
    let r2_mapped = RecordBuilder::new()
        .name("q1")
        .flags(
            Flags::SEGMENTED
                | Flags::LAST_SEGMENT
                | Flags::REVERSE_COMPLEMENTED
                | Flags::PROPERLY_SEGMENTED,
        )
        .reference_sequence_id(0)
        .alignment_start(300)
        .cigar("75M")
        .mapping_quality(30)
        .sequence(&"A".repeat(75))
        .qualities(&[30u8; 75])
        .tag("PG", MAPPED_PG_ID.to_string())
        .tag("AS", 55i32)
        .build();
    mapped.push_record(r1_mapped);
    mapped.push_record(r2_mapped);
    // Supplementary R1: pos 500, 60M, MAPQ 20, AS=33.
    let supp_rec = RecordBuilder::new()
        .name("q1")
        .flags(
            Flags::SEGMENTED
                | Flags::FIRST_SEGMENT
                | Flags::SUPPLEMENTARY
                | Flags::REVERSE_COMPLEMENTED,
        )
        .reference_sequence_id(0)
        .alignment_start(500)
        .cigar("60M")
        .mapping_quality(20)
        .sequence(&"A".repeat(60))
        .qualities(&[30u8; 60])
        .tag("PG", MAPPED_PG_ID.to_string())
        .tag("AS", 33i32)
        .build();
    mapped.push_record(supp_rec);
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 3);
    let r1_primary = records
        .iter()
        .find(|r| r.flags().is_first_segment() && !r.flags().is_supplementary())
        .unwrap();
    let r2_primary = records
        .iter()
        .find(|r| !r.flags().is_first_segment() && !r.flags().is_supplementary())
        .unwrap();
    let r1_supp = records
        .iter()
        .find(|r| r.flags().is_first_segment() && r.flags().is_supplementary())
        .unwrap();
    // Supplementary's mate coordinates must reference the primary R2.
    assert_eq!(
        r1_supp.mate_reference_sequence_id(),
        r2_primary.reference_sequence_id(),
        "R1 supp RNEXT should point to R2 primary's reference"
    );
    assert_eq!(
        r1_supp.mate_alignment_start(),
        r2_primary.alignment_start(),
        "R1 supp PNEXT should point to R2 primary's position"
    );
    // MQ/MC come from the primary R2 (MAPQ 30, CIGAR 75M).
    let supp_mq = r1_supp.data().get(&Tag::from(SamTag::MQ));
    assert!(
        matches!(supp_mq, Some(BufValue::Int8(30))),
        "R1 supp MQ should be R2 primary's mapq (30), got {supp_mq:?}"
    );
    if let Some(BufValue::String(mc)) = r1_supp.data().get(&Tag::from(SamTag::MC)) {
        assert_eq!(mc.to_string(), "75M", "R1 supp MC should be R2 primary's CIGAR");
    } else {
        panic!("R1 supp should have MC tag");
    }
    // ms carries the mate's alignment score (R2 primary's AS=55).
    let supp_ms = r1_supp.data().get(&Tag::from(SamTag::MS));
    assert!(
        matches!(supp_ms, Some(BufValue::Int8(55))),
        "R1 supp ms should be R2 primary's AS (55), got {supp_ms:?}"
    );
    // TLEN on the supplementary mirrors the primary R1's TLEN.
    let r1_primary_tlen = r1_primary.template_length();
    let r1_supp_tlen = r1_supp.template_length();
    assert_eq!(
        r1_supp_tlen, r1_primary_tlen,
        "R1 supp TLEN should equal R1 primary TLEN (both = -R2_TLEN)"
    );
    Ok(())
}
#[test]
fn test_fix_mate_info_one_unmapped() -> Result<()> {
    // R1 maps, R2 does not: TLEN must be zeroed on both records and the
    // mapped read must not carry MQ/MC tags.
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("RX", BufValue::from("ACGT".to_string()));
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    let mut mapped_attrs = HashMap::new();
    mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
    mapped.add_pair_with_attrs("q1", Some(100), None, true, true, &mapped_attrs);
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 2);
    let first = records.iter().find(|r| r.flags().is_first_segment()).unwrap();
    let second = records.iter().find(|r| !r.flags().is_first_segment()).unwrap();
    assert_eq!(first.template_length(), 0, "R1 TLEN should be 0 when mate is unmapped");
    assert_eq!(second.template_length(), 0, "R2 TLEN should be 0 when mate is unmapped");
    let mq = first.data().get(&Tag::from(SamTag::MQ));
    assert!(mq.is_none(), "Mapped read should not have MQ when mate is unmapped");
    let mc = first.data().get(&Tag::from(SamTag::MC));
    assert!(mc.is_none(), "Mapped read should not have MC when mate is unmapped");
    Ok(())
}
#[test]
// Mirror of the R1-supplementary test: an R2 supplementary alignment must have
// its RNEXT/PNEXT/MQ/MC/ms filled from the *primary* R1 alignment.
fn test_fix_mate_info_r2_supplementary() -> Result<()> {
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("RX", BufValue::from("ACGT".to_string()));
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    // Primary R1: pos 100, 50M, MAPQ 40, AS=77 — the expected mate target.
    let r1_mapped = RecordBuilder::new()
        .name("q1")
        .flags(Flags::SEGMENTED | Flags::FIRST_SEGMENT | Flags::MATE_REVERSE_COMPLEMENTED)
        .reference_sequence_id(0)
        .alignment_start(100)
        .cigar("50M")
        .mapping_quality(40)
        .sequence(&"A".repeat(50))
        .qualities(&[30u8; 50])
        .tag("PG", MAPPED_PG_ID.to_string())
        .tag("AS", 77i32)
        .build();
    // Primary R2: pos 300, 75M, MAPQ 30, AS=55.
    let r2_mapped = RecordBuilder::new()
        .name("q1")
        .flags(
            Flags::SEGMENTED
                | Flags::LAST_SEGMENT
                | Flags::REVERSE_COMPLEMENTED
                | Flags::PROPERLY_SEGMENTED,
        )
        .reference_sequence_id(0)
        .alignment_start(300)
        .cigar("75M")
        .mapping_quality(30)
        .sequence(&"A".repeat(75))
        .qualities(&[30u8; 75])
        .tag("PG", MAPPED_PG_ID.to_string())
        .tag("AS", 55i32)
        .build();
    mapped.push_record(r1_mapped);
    mapped.push_record(r2_mapped);
    // Supplementary R2 (LAST_SEGMENT | SUPPLEMENTARY): pos 500, 60M, MAPQ 20.
    let supp_rec = RecordBuilder::new()
        .name("q1")
        .flags(
            Flags::SEGMENTED
                | Flags::LAST_SEGMENT
                | Flags::SUPPLEMENTARY
                | Flags::REVERSE_COMPLEMENTED,
        )
        .reference_sequence_id(0)
        .alignment_start(500)
        .cigar("60M")
        .mapping_quality(20)
        .sequence(&"A".repeat(60))
        .qualities(&[30u8; 60])
        .tag("PG", MAPPED_PG_ID.to_string())
        .tag("AS", 33i32)
        .build();
    mapped.push_record(supp_rec);
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 3);
    let r1_primary = records
        .iter()
        .find(|r| r.flags().is_first_segment() && !r.flags().is_supplementary())
        .unwrap();
    let r2_supp = records
        .iter()
        .find(|r| !r.flags().is_first_segment() && r.flags().is_supplementary())
        .unwrap();
    // Supplementary R2's mate coordinates must reference the primary R1.
    assert_eq!(
        r2_supp.mate_reference_sequence_id(),
        r1_primary.reference_sequence_id(),
        "R2 supp RNEXT should point to R1 primary's reference"
    );
    assert_eq!(
        r2_supp.mate_alignment_start(),
        r1_primary.alignment_start(),
        "R2 supp PNEXT should point to R1 primary's position"
    );
    // MQ/MC come from the primary R1 (MAPQ 40, CIGAR 50M).
    let supp_mq = r2_supp.data().get(&Tag::from(SamTag::MQ));
    assert!(
        matches!(supp_mq, Some(BufValue::Int8(40))),
        "R2 supp MQ should be R1 primary's mapq (40), got {supp_mq:?}"
    );
    if let Some(BufValue::String(mc)) = r2_supp.data().get(&Tag::from(SamTag::MC)) {
        assert_eq!(mc.to_string(), "50M", "R2 supp MC should be R1 primary's CIGAR");
    } else {
        panic!("R2 supp should have MC tag");
    }
    // ms carries the mate's alignment score (R1 primary's AS=77).
    let supp_ms = r2_supp.data().get(&Tag::from(SamTag::MS));
    assert!(
        matches!(supp_ms, Some(BufValue::Int8(77))),
        "R2 supp ms should be R1 primary's AS (77), got {supp_ms:?}"
    );
    Ok(())
}
#[test]
fn test_fix_mate_info_both_unmapped() -> Result<()> {
    // Neither mate maps: every record must have TLEN 0 and no MQ/MC tags.
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("RX", BufValue::from("ACGT".to_string()));
    unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
    let mut mapped_attrs = HashMap::new();
    mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
    mapped.add_pair_with_attrs("q1", None, None, true, true, &mapped_attrs);
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 2);
    for record in &records {
        assert_eq!(record.template_length(), 0, "TLEN should be 0 when both unmapped");
        let data = record.data();
        assert!(
            data.get(&Tag::from(SamTag::MQ)).is_none(),
            "MQ should not exist when both unmapped"
        );
        assert!(
            data.get(&Tag::from(SamTag::MC)).is_none(),
            "MC should not exist when both unmapped"
        );
    }
    Ok(())
}
#[test]
fn test_varied_tag_types_roundtrip() -> Result<()> {
    // Tags of every scalar value kind (char, float, unsigned/signed ints,
    // hex) must survive the unmapped -> mapped transfer.
    let mut unmapped = SamBuilder::new_unmapped();
    let mut mapped = SamBuilder::new_mapped();
    let mut attrs = HashMap::new();
    attrs.insert("RX", BufValue::from("ACGT".to_string()));
    attrs.insert("ca", BufValue::Character(b'X'));
    attrs.insert("fi", BufValue::Float(1.23));
    attrs.insert("ui", BufValue::UInt8(200));
    attrs.insert("si", BufValue::Int16(-300));
    attrs.insert("ul", BufValue::UInt32(100_000));
    attrs.insert("hx", BufValue::Hex(bstr::BString::from("DEADBEEF")));
    unmapped.add_frag_with_attrs("q1", None, true, &attrs);
    let mut mapped_attrs = HashMap::new();
    mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
    mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs);
    let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
    assert_eq!(records.len(), 1);
    let rec = &records[0];
    let data = rec.data();
    assert!(
        matches!(data.get(&Tag::new(b'c', b'a')), Some(BufValue::Character(b'X'))),
        "Character tag should roundtrip"
    );
    match data.get(&Tag::new(b'f', b'i')) {
        Some(BufValue::Float(f)) => {
            assert!((f - 1.23).abs() < 0.001, "Float tag should roundtrip, got {f}");
        }
        _ => panic!("Float tag missing"),
    }
    assert!(data.get(&Tag::new(b'u', b'i')).is_some(), "UInt8 tag should exist");
    let si_val = data.get(&Tag::new(b's', b'i'));
    assert!(
        matches!(si_val, Some(BufValue::Int16(-300))),
        "Int16 tag should roundtrip, got {si_val:?}"
    );
    assert!(data.get(&Tag::new(b'u', b'l')).is_some(), "UInt32 tag should exist");
    match data.get(&Tag::new(b'h', b'x')) {
        Some(BufValue::Hex(h)) => {
            assert_eq!(h.to_string(), "DEADBEEF", "Hex tag should roundtrip");
        }
        _ => panic!("Hex tag missing or wrong type"),
    }
    Ok(())
}
// CLI parsing cases for --exclude-missing-reads: the flag must accept bare
// (implies true), space-separated bool values, and key=value forms, and
// default to false when absent.
#[rstest]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads", "true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads", "false"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads=true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads=false"], false)]
fn test_exclude_missing_reads_parsing(#[case] args: &[&str], #[case] expected: bool) {
    let cmd = Zipper::try_parse_from(args).expect("failed to parse Zipper arguments");
    assert_eq!(cmd.exclude_missing_reads, expected);
}
// CLI parsing cases for --skip-pa-tags: same accepted forms as
// --exclude-missing-reads (bare flag, space-separated bool, key=value),
// defaulting to false when absent.
#[rstest]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags", "true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags", "false"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags=true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags=false"], false)]
fn test_skip_pa_tags_parsing(#[case] args: &[&str], #[case] expected: bool) {
    let cmd = Zipper::try_parse_from(args).expect("failed to parse Zipper arguments");
    assert_eq!(cmd.skip_pa_tags, expected);
}
}