use crate::bam_io::{
RawBamReaderAuto, create_raw_bam_reader, create_raw_bam_writer, is_stdin_path,
};
use crate::batched_sam_reader::BatchedSamReader;
use crate::commands::command::Command;
use crate::commands::common::{CompressionOptions, parse_bool};
use crate::logging::OperationTimer;
use crate::progress::ProgressTracker;
use crate::reference::{ReferenceReader, find_dict_path};
use crate::sam::{TemplateCoordinateInfo, check_sort};
use crate::sort::bam_fields;
use crate::template::{Template, TemplateIterator};
use crate::umi::TagInfo;
use crate::validation::validate_file_exists;
use anyhow::{Context, Result};
use bstr::ByteSlice;
use clap::Parser;
use fgumi_raw_bam::{BAM_BASE_TO_ASCII, RawRecord, RawRecordView, RecordBufEncoder};
use log::{debug, info, warn};
use noodles::core::Position;
use noodles::sam::Header;
use std::collections::HashSet;
use std::io::{BufRead, BufReader, Read};
use std::path::{Path, PathBuf};
#[derive(Parser, Debug)]
#[command(
name = "zipper",
author,
version,
about = "\x1b[38;5;72m[ALIGNMENT]\x1b[0m \x1b[36mZip unmapped BAM with aligned BAM\x1b[0m",
long_about = r#"
Merges unmapped and mapped BAM files, transferring tags and metadata.
Takes an unmapped BAM (typically from FASTQ) and a mapped BAM (after alignment) and merges
them, copying tags from the unmapped to mapped reads. Both BAMs must be queryname sorted or
grouped, and have the same read name ordering.
The tool transfers tags from the unmapped reads to their corresponding mapped reads. For reads
mapped to the negative strand, tags can be optionally reversed or reverse-complemented. All
QC pass/fail flags are also transferred from the unmapped to mapped reads.
## Tag Manipulation
You can specify which tags to manipulate for reads mapped to the negative strand:
- --tags-to-reverse: Reverses array and string tags (e.g., [1,2,3] becomes [3,2,1])
- --tags-to-revcomp: Reverse complements sequence tags (e.g., AGAGG becomes CCTCT)
Named tag sets like "Consensus" are automatically expanded to their constituent tags:
- Consensus: aD bD cD aM bM cM aE bE cE ad bd cd ae be ce ac bc
## Default Behavior
By default, input is read from stdin and output is written to stdout, allowing for streaming
workflows like:
bwa mem -t 8 -p -K 150000000 -Y ref.fa reads.fq | fgumi zipper -u unmapped.bam -r ref.fa | fgumi sort -i /dev/stdin -o output.bam --order template-coordinate
"#
)]
#[command(verbatim_doc_comment)]
pub struct Zipper {
#[arg(short = 'i', long, default_value = "-")]
pub input: PathBuf,
#[arg(short = 'u', long)]
pub unmapped: PathBuf,
#[arg(short = 'r', long)]
pub reference: PathBuf,
#[arg(short = 'o', long, default_value = "-")]
pub output: PathBuf,
#[arg(long, value_delimiter = ',')]
pub tags_to_remove: Vec<String>,
#[arg(long, value_delimiter = ',')]
pub tags_to_reverse: Vec<String>,
#[arg(long, value_delimiter = ',')]
pub tags_to_revcomp: Vec<String>,
#[arg(short = 'b', long, default_value = "50000")]
pub buffer: usize,
#[arg(long, short = 't', default_value = "1")]
pub threads: usize,
#[command(flatten)]
pub compression: CompressionOptions,
#[arg(short = 'K', long = "bwa-chunk-size", default_value = "150000000")]
pub bwa_chunk_size: u64,
#[arg(long = "exclude-missing-reads", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub exclude_missing_reads: bool,
#[arg(long = "skip-pa-tags", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub skip_tc_tags: bool,
#[arg(long = "restore-unconverted-bases", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub restore_unconverted_bases: bool,
}
pub fn build_output_header(unmapped: &Header, mapped: &Header, dict_path: &Path) -> Result<Header> {
let dict_file =
std::fs::File::open(dict_path).context("Failed to open reference dictionary")?;
let mut dict_reader = noodles::sam::io::Reader::new(std::io::BufReader::new(dict_file));
let dict_header = dict_reader.read_header()?;
let mut header = Header::builder();
for (name, map) in dict_header.reference_sequences() {
header = header.add_reference_sequence(name.clone(), map.clone());
}
for comment in unmapped.comments() {
header = header.add_comment(comment.clone());
}
for comment in mapped.comments() {
header = header.add_comment(comment.clone());
}
let mut rg_ids = HashSet::new();
for (id, rg) in mapped.read_groups() {
header = header.add_read_group(id.clone(), rg.clone());
rg_ids.insert(id.clone());
}
for (id, rg) in unmapped.read_groups() {
if !rg_ids.contains(id) {
header = header.add_read_group(id.clone(), rg.clone());
}
}
let mut pg_ids = HashSet::new();
for (id, pg) in mapped.programs().as_ref() {
header = header.add_program(id.clone(), pg.clone());
pg_ids.insert(id.clone());
}
for (id, pg) in unmapped.programs().as_ref() {
if !pg_ids.contains(id) {
header = header.add_program(id.clone(), pg.clone());
}
}
if let Some(hdr) = unmapped.header() {
header = header.set_header(hdr.clone());
}
Ok(header.build())
}
fn add_template_coordinate_tags_raw(mapped: &mut Template) {
let rr = &mapped.records;
let has_sec_supp = rr.iter().any(|r| {
let f = RawRecordView::new(r).flags();
(f & bam_fields::flags::SECONDARY) != 0 || (f & bam_fields::flags::SUPPLEMENTARY) != 0
});
if !has_sec_supp {
return;
}
let r1_info: Option<(i32, i32, bool)> = mapped.r1.and_then(|(i, _)| {
let r = &rr[i];
let f = RawRecordView::new(r).flags();
if (f & bam_fields::flags::UNMAPPED) != 0 {
return None;
}
let ref_id = bam_fields::ref_id(r);
let pos = bam_fields::unclipped_5prime_from_raw_bam(r);
let is_reverse = (f & bam_fields::flags::REVERSE) != 0;
Some((ref_id, pos, is_reverse))
});
let r2_info: Option<(i32, i32, bool)> = mapped.r2.and_then(|(i, _)| {
let r = &rr[i];
let f = RawRecordView::new(r).flags();
if (f & bam_fields::flags::UNMAPPED) != 0 {
return None;
}
let ref_id = bam_fields::ref_id(r);
let pos = bam_fields::unclipped_5prime_from_raw_bam(r);
let is_reverse = (f & bam_fields::flags::REVERSE) != 0;
Some((ref_id, pos, is_reverse))
});
let tc_info: Option<TemplateCoordinateInfo> = match (r1_info, r2_info) {
(Some((t1, p1, n1)), Some((t2, p2, n2))) => {
if (t1, p1) <= (t2, p2) {
Some(TemplateCoordinateInfo::new(t1, p1, n1, t2, p2, n2))
} else {
Some(TemplateCoordinateInfo::new(t2, p2, n2, t1, p1, n1))
}
}
(Some((t, p, n)), None) | (None, Some((t, p, n))) => {
Some(TemplateCoordinateInfo::new(t, p, n, t, p, n))
}
(None, None) => None,
};
let Some(tc_info) = tc_info else {
return;
};
let tc_tag = b"tc";
let tc_values = [
tc_info.tid1,
tc_info.pos1,
i32::from(tc_info.neg1),
tc_info.tid2,
tc_info.pos2,
i32::from(tc_info.neg2),
];
for record in mapped.records.iter_mut() {
let f = RawRecordView::new(record.as_ref()).flags();
if (f & bam_fields::flags::SECONDARY) != 0 || (f & bam_fields::flags::SUPPLEMENTARY) != 0 {
bam_fields::remove_tag(record.as_mut_vec(), tc_tag);
bam_fields::append_i32_array_tag(record.as_mut_vec(), tc_tag, &tc_values);
}
}
}
fn collect_mapped_indices(mapped: &Template, is_first_segment: bool) -> Vec<usize> {
let mut indices = Vec::with_capacity(4);
if is_first_segment {
if let Some((i, _)) = mapped.r1 {
indices.push(i);
}
if let Some((s, e)) = mapped.r1_supplementals {
indices.extend(s..e);
}
if let Some((s, e)) = mapped.r1_secondaries {
indices.extend(s..e);
}
} else {
if let Some((i, _)) = mapped.r2 {
indices.push(i);
}
if let Some((s, e)) = mapped.r2_supplementals {
indices.extend(s..e);
}
if let Some((s, e)) = mapped.r2_secondaries {
indices.extend(s..e);
}
}
indices
}
pub fn merge_raw(
unmapped: &Template,
mapped: &mut Template,
tag_info: &TagInfo,
skip_tc_tags: bool,
) -> Result<()> {
mapped.fix_mate_info()?;
for record in mapped.records_mut().iter_mut() {
for tag_str in &tag_info.remove {
if tag_str.len() == 2 {
let tag_bytes: [u8; 2] = [tag_str.as_bytes()[0], tag_str.as_bytes()[1]];
bam_fields::remove_tag(record.as_mut_vec(), &tag_bytes);
}
}
}
let has_transforms = tag_info.has_revs_or_revcomps();
let pg_tag: [u8; 2] = [b'P', b'G'];
for u in unmapped.primary_reads() {
let u_flags = RawRecordView::new(u).flags();
let is_unpaired = (u_flags & bam_fields::flags::PAIRED) == 0;
let is_first = (u_flags & bam_fields::flags::FIRST_SEGMENT) != 0;
let mapped_indices = collect_mapped_indices(mapped, is_unpaired || is_first);
let (has_pos, has_neg) = {
let rr = mapped.records();
let mut pos = false;
let mut neg = false;
for &i in &mapped_indices {
if (RawRecordView::new(&rr[i]).flags() & bam_fields::flags::REVERSE) == 0 {
pos = true;
} else {
neg = true;
}
if pos && neg {
break;
}
}
(pos, neg)
};
let u_tags: Vec<fgumi_raw_bam::TagEntry<'_>> =
RawRecordView::new(u).tags().iter().collect();
if has_pos {
let rr = mapped.records_mut();
for &i in &mapped_indices {
if (RawRecordView::new(&rr[i]).flags() & bam_fields::flags::REVERSE) != 0 {
continue;
}
let aux = bam_fields::aux_data_slice(&rr[i]);
let has_pg = bam_fields::find_tag_type(aux, &pg_tag).is_some();
for entry in &u_tags {
if entry.tag == pg_tag && has_pg {
continue;
}
let tag_str = std::str::from_utf8(&entry.tag).unwrap_or("");
if tag_info.remove.contains(tag_str) {
continue;
}
bam_fields::remove_tag(rr[i].as_mut_vec(), &entry.tag);
append_raw_tag_entry(rr[i].as_mut_vec(), entry);
}
}
}
if has_neg {
let rr = mapped.records_mut();
for &i in &mapped_indices {
if (RawRecordView::new(&rr[i]).flags() & bam_fields::flags::REVERSE) == 0 {
continue;
}
let aux = bam_fields::aux_data_slice(&rr[i]);
let has_pg = bam_fields::find_tag_type(aux, &pg_tag).is_some();
let aux_offset =
bam_fields::aux_data_offset_from_record(&rr[i]).unwrap_or(rr[i].len());
for entry in &u_tags {
if entry.tag == pg_tag && has_pg {
continue;
}
let tag_str = std::str::from_utf8(&entry.tag).unwrap_or("");
if tag_info.remove.contains(tag_str) {
continue;
}
bam_fields::remove_tag(rr[i].as_mut_vec(), &entry.tag);
append_raw_tag_entry(rr[i].as_mut_vec(), entry);
if has_transforms && tag_info.reverse.contains(tag_str) {
reverse_tag_in_place_raw_by_type(
&mut rr[i],
aux_offset,
entry.tag,
entry.type_byte,
);
} else if has_transforms && tag_info.revcomp.contains(tag_str) {
revcomp_tag_in_place_raw_by_type(
&mut rr[i],
aux_offset,
entry.tag,
entry.type_byte,
);
}
}
}
}
let is_qc_fail = (u_flags & bam_fields::flags::QC_FAIL) != 0;
let rr = mapped.records_mut();
for &i in &mapped_indices {
let mut f = RawRecordView::new(&rr[i]).flags();
if is_qc_fail {
f |= bam_fields::flags::QC_FAIL;
} else {
f &= !bam_fields::flags::QC_FAIL;
}
bam_fields::set_flags(&mut rr[i], f);
}
}
for record in mapped.records_mut().iter_mut() {
bam_fields::normalize_int_tag_to_smallest_signed(record.as_mut_vec(), b"AS");
bam_fields::normalize_int_tag_to_smallest_signed(record.as_mut_vec(), b"XS");
}
if !skip_tc_tags {
add_template_coordinate_tags_raw(mapped);
}
Ok(())
}
#[inline]
fn append_raw_tag_entry(dest: &mut Vec<u8>, entry: &fgumi_raw_bam::TagEntry<'_>) {
dest.push(entry.tag[0]);
dest.push(entry.tag[1]);
dest.push(entry.type_byte);
dest.extend_from_slice(entry.value_bytes);
}
fn reverse_tag_in_place_raw_by_type(
record: &mut [u8],
aux_offset: usize,
tag: [u8; 2],
type_byte: u8,
) {
match type_byte {
b'Z' => {
bam_fields::reverse_string_tag_in_place(record, aux_offset, &tag);
}
b'B' => {
bam_fields::reverse_array_tag_in_place(record, aux_offset, &tag);
}
_ => {}
}
}
fn revcomp_tag_in_place_raw_by_type(
record: &mut [u8],
aux_offset: usize,
tag: [u8; 2],
type_byte: u8,
) {
match type_byte {
b'Z' => {
bam_fields::reverse_complement_string_tag_in_place(record, aux_offset, &tag);
}
b'B' => {
bam_fields::reverse_array_tag_in_place(record, aux_offset, &tag);
}
_ => {}
}
}
fn encode_unmapped_template_records(
template: &Template,
_header: &Header,
) -> Result<Vec<RawRecord>> {
Ok(template.records().to_vec())
}
const YD_FORWARD: &[u8] = b"f";
const YD_REVERSE: &[u8] = b"r";
fn restore_unconverted_bases_in_raw_template(
template: &mut Template,
reference: &ReferenceReader,
header: &Header,
) -> Result<()> {
for rec in template.records_mut().iter_mut() {
restore_unconverted_bases_in_raw_record(rec, reference, header)?;
}
Ok(())
}
fn restore_unconverted_bases_in_raw_record(
rec: &mut RawRecord,
reference: &ReferenceReader,
header: &Header,
) -> Result<()> {
if rec.is_unmapped() {
return Ok(());
}
let yd_bytes = rec.tags().find_string(b"YD").map(|s| s.to_vec());
let is_top = match yd_bytes.as_deref() {
Some(s) if s == YD_FORWARD => true,
Some(s) if s == YD_REVERSE => false,
_ => return Ok(()), };
let raw_ref_id = rec.ref_id();
if raw_ref_id < 0 {
return Ok(());
}
let ref_id_usize = raw_ref_id as usize;
let (ref_name, _) = header
.reference_sequences()
.get_index(ref_id_usize)
.context("reference sequence ID not found in header")?;
let ref_name: &str = ref_name.to_str().context("reference sequence name is not valid UTF-8")?;
let alignment_start = match rec.alignment_start_1based() {
Some(pos) => pos,
None => return Ok(()),
};
let ref_span = rec.reference_length();
if ref_span <= 0 {
return Ok(());
}
let ref_span = ref_span as usize;
let ref_start = Position::try_from(alignment_start)?;
let ref_end = Position::try_from(alignment_start + ref_span - 1)?;
let ref_bases = reference.fetch_slice(ref_name, ref_start, ref_end)?;
let is_reverse = rec.is_reverse();
let (ref_target, converted_base, unconverted_base) = match (is_top, is_reverse) {
(true, false) | (false, true) => (b'C', b'T', b'C'),
(true, true) | (false, false) => (b'G', b'A', b'G'),
};
let ref_target_lower = ref_target.to_ascii_lowercase();
let converted_base_lower = converted_base.to_ascii_lowercase();
if memchr::memchr2(ref_target, ref_target_lower, ref_bases).is_none() {
return Ok(());
}
let cigar_ops = rec.cigar_ops_vec();
let l_seq = rec.l_seq() as usize;
let mut changed = false;
let mut read_pos: usize = 0;
let mut ref_offset: usize = 0;
for op in &cigar_ops {
let op_type = op & 0xF;
let len = (op >> 4) as usize;
match op_type {
0 | 7 | 8 => {
for i in 0..len {
if ref_offset + i >= ref_bases.len() {
break;
}
let rb = ref_bases[ref_offset + i];
if (rb == ref_target || rb == ref_target_lower) && read_pos + i < l_seq {
let raw_code = rec.get_base(read_pos + i);
let sb = BAM_BASE_TO_ASCII[raw_code as usize];
if sb == converted_base || sb == converted_base_lower {
rec.set_base(read_pos + i, unconverted_base);
changed = true;
}
}
}
read_pos += len;
ref_offset += len;
}
1 | 4 => {
read_pos += len;
}
2 | 3 => {
ref_offset += len;
}
5 | 6 => {
}
_ => {}
}
}
if changed {
let mut ed = rec.tags_editor();
ed.remove(b"NM");
ed.remove(b"MD");
}
Ok(())
}
fn record_bufs_to_templates<'a, I>(
iter: I,
header: &'a Header,
) -> impl Iterator<Item = Result<Template>> + 'a
where
I: Iterator<Item = Result<noodles::sam::alignment::RecordBuf>> + 'a,
{
let mut encoder = RecordBufEncoder::new(header);
let mut pending: Option<noodles::sam::alignment::RecordBuf> = None;
let mut exhausted = false;
let mut iter = iter;
std::iter::from_fn(move || {
if exhausted && pending.is_none() {
return None;
}
let mut batch: Vec<noodles::sam::alignment::RecordBuf> = Vec::with_capacity(2);
if let Some(p) = pending.take() {
batch.push(p);
}
loop {
if exhausted {
break;
}
match iter.next() {
Some(Ok(rec)) => {
if batch.is_empty() {
batch.push(rec);
} else {
let first_name = batch[0].name().map(|n| n.to_vec());
let this_name = rec.name().map(|n| n.to_vec());
if first_name == this_name {
batch.push(rec);
} else {
pending = Some(rec);
break;
}
}
}
Some(Err(e)) => return Some(Err(e)),
None => {
exhausted = true;
break;
}
}
}
if batch.is_empty() {
return None;
}
let mut raw_records = Vec::with_capacity(batch.len());
for rec in &batch {
match encoder.encode(rec) {
Ok(raw) => raw_records.push(raw),
Err(e) => return Some(Err(anyhow::anyhow!("Failed to encode record: {e}"))),
}
}
Some(Template::from_records(raw_records))
})
}
impl Zipper {
fn process_raw<U, M>(
&self,
unmapped_iter: U,
mut mapped_iter: M,
output_header: &Header,
tag_info: &TagInfo,
reference: Option<&ReferenceReader>,
) -> Result<u64>
where
U: Iterator<Item = Result<Template>>,
M: Iterator<Item = Result<Template>>,
{
let mut writer = create_raw_bam_writer(
&self.output,
output_header,
self.threads,
self.compression.compression_level,
)?;
let progress = ProgressTracker::new("Processed records").with_interval(1_000_000);
let mut mapped_peek: Option<Template> = None;
let mut templates_not_in_mapped_bam: u64 = 0;
for unmapped_result in unmapped_iter {
let unmapped_template = unmapped_result?;
if mapped_peek.is_none() {
mapped_peek = mapped_iter.next().transpose()?;
}
if let Some(ref mut mapped_template) = mapped_peek {
if mapped_template.name == unmapped_template.name {
merge_raw(&unmapped_template, mapped_template, tag_info, self.skip_tc_tags)?;
if let Some(ref_reader) = reference {
restore_unconverted_bases_in_raw_template(
mapped_template,
ref_reader,
output_header,
)?;
}
for rec in mapped_template.records() {
writer.write_raw_record(rec)?;
progress.log_if_needed(1);
}
mapped_peek = None;
} else {
debug!(
"Found unmapped read with no corresponding mapped \
read: {}",
String::from_utf8_lossy(&unmapped_template.name)
);
if self.exclude_missing_reads {
templates_not_in_mapped_bam += 1;
} else {
let raw =
encode_unmapped_template_records(&unmapped_template, output_header)?;
for rec in &raw {
writer.write_raw_record(rec)?;
progress.log_if_needed(1);
}
}
}
} else {
debug!(
"Found unmapped read with no corresponding mapped \
read: {}",
String::from_utf8_lossy(&unmapped_template.name)
);
if self.exclude_missing_reads {
templates_not_in_mapped_bam += 1;
} else {
let raw = encode_unmapped_template_records(&unmapped_template, output_header)?;
for rec in &raw {
writer.write_raw_record(rec)?;
progress.log_if_needed(1);
}
}
}
}
progress.log_final();
if let Some(remaining) = mapped_peek {
anyhow::bail!(
"Error: processed all unmapped reads but there are mapped \
reads remaining. Found template '{}'. Please ensure the \
unmapped and mapped reads have the same set of read names \
in the same order, and reads with the same name are \
consecutive (grouped) in each input.",
String::from_utf8_lossy(&remaining.name)
);
}
if let Some(remaining) = mapped_iter.next() {
let remaining = remaining?;
anyhow::bail!(
"Error: processed all unmapped reads but there are mapped \
reads remaining. Found template '{}'. Please ensure the \
unmapped and mapped reads have the same set of read names \
in the same order, and reads with the same name are \
consecutive (grouped) in each input.",
String::from_utf8_lossy(&remaining.name)
);
}
if self.exclude_missing_reads && templates_not_in_mapped_bam > 0 {
info!(
"Excluded {templates_not_in_mapped_bam} templates that \
were not present in the aligned BAM."
);
}
writer.finish()?;
Ok(progress.count())
}
}
type RawBamStdinReader =
fgumi_raw_bam::RawBamReader<noodles::bgzf::io::Reader<BufReader<Box<dyn Read + Send>>>>;
enum MappedReader {
Sam(noodles::sam::io::Reader<Box<dyn BufRead + Send>>),
Bam(RawBamReaderAuto),
StdinBam(RawBamStdinReader),
}
const BGZF_MAGIC: [u8; 4] = [0x1f, 0x8b, 0x08, 0x04];
fn peek_stdin_bgzf(mut stdin: Box<dyn Read + Send>) -> Result<(bool, Box<dyn Read + Send>)> {
let mut prefix = [0u8; BGZF_MAGIC.len()];
let mut filled = 0usize;
while filled < prefix.len() {
match stdin.read(&mut prefix[filled..]) {
Ok(0) => break,
Ok(n) => filled += n,
Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
Err(e) => return Err(e).context("Failed to peek stdin"),
}
}
let is_bgzf = filled == BGZF_MAGIC.len() && prefix == BGZF_MAGIC;
let restored: Box<dyn Read + Send> =
Box::new(std::io::Cursor::new(prefix[..filled].to_vec()).chain(stdin));
Ok((is_bgzf, restored))
}
impl Command for Zipper {
fn execute(&self, command_line: &str) -> Result<()> {
info!("Starting zipper");
let timer = OperationTimer::new("Zipping BAMs");
validate_file_exists(&self.unmapped, "unmapped BAM file")?;
validate_file_exists(&self.reference, "reference FASTA file")?;
let dict_path = find_dict_path(&self.reference).ok_or_else(|| {
anyhow::anyhow!(
"Reference dictionary file not found. Tried:\n \
- {}\n \
- {}.dict\n\
Please run: samtools dict {} -o {}",
self.reference.with_extension("dict").display(),
self.reference.display(),
self.reference.display(),
self.reference.with_extension("dict").display()
)
})?;
let (unmapped_raw_reader, unmapped_header) = create_raw_bam_reader(&self.unmapped, 1)?;
let (mapped_reader, mapped_header) = if is_stdin_path(&self.input) {
let stdin: Box<dyn Read + Send> = Box::new(std::io::stdin());
let (is_bgzf, stdin) = peek_stdin_bgzf(stdin)?;
let buffered = BufReader::with_capacity(64 * 1024, stdin);
if is_bgzf {
warn!(
"BAM input detected on stdin. For best performance, pipe SAM directly \
from the aligner (e.g. bwa mem ... | fgumi zipper ...)."
);
let bgzf = noodles::bgzf::io::Reader::new(buffered);
let mut bam_reader = noodles::bam::io::Reader::from(bgzf);
let header = bam_reader.read_header()?;
let bgzf = bam_reader.into_inner();
(MappedReader::StdinBam(fgumi_raw_bam::RawBamReader::new(bgzf)), header)
} else {
info!(
"Reading SAM from stdin with adaptive buffer (bwa -K {})",
self.bwa_chunk_size
);
let reader: Box<dyn BufRead + Send> =
Box::new(BatchedSamReader::new(buffered, self.bwa_chunk_size));
let mut sam_reader = noodles::sam::io::Reader::new(reader);
let header = sam_reader.read_header()?;
(MappedReader::Sam(sam_reader), header)
}
} else if self.input.extension().is_some_and(|ext| ext.eq_ignore_ascii_case("bam")) {
warn!(
"BAM input detected for --input. For best performance, pipe SAM directly \
from the aligner (e.g. bwa mem ... | fgumi zipper ...)."
);
let (raw_reader, header) = create_raw_bam_reader(&self.input, self.threads)?;
(MappedReader::Bam(raw_reader), header)
} else {
let reader: Box<dyn BufRead + Send> = Box::new(BufReader::with_capacity(
256 * 1024,
std::fs::File::open(&self.input).context("Failed to open mapped SAM")?,
));
let mut sam_reader = noodles::sam::io::Reader::new(reader);
let header = sam_reader.read_header()?;
(MappedReader::Sam(sam_reader), header)
};
check_sort(&unmapped_header, &self.unmapped, "unmapped");
check_sort(&mapped_header, &self.input, "mapped");
let output_header = build_output_header(&unmapped_header, &mapped_header, &dict_path)?;
let output_header = crate::commands::common::add_pg_record(output_header, command_line)?;
let tag_info = TagInfo::new(
self.tags_to_remove.clone(),
self.tags_to_reverse.clone(),
self.tags_to_revcomp.clone(),
);
if !tag_info.remove.is_empty() {
info!("Tags for removal: {:?}", tag_info.remove);
}
if !tag_info.reverse.is_empty() {
info!("Tags being reversed: {:?}", tag_info.reverse);
}
if !tag_info.revcomp.is_empty() {
info!("Tags being reverse complemented: {:?}", tag_info.revcomp);
}
let reference = if self.restore_unconverted_bases {
info!("Loading reference FASTA for unconverted base restoration");
Some(ReferenceReader::new(&self.reference)?)
} else {
None
};
let (unmapped_tx, unmapped_rx) =
std::sync::mpsc::sync_channel::<Result<Template>>(self.buffer);
std::thread::spawn(move || {
let unmapped_iter = TemplateIterator::new(unmapped_raw_reader);
for template in unmapped_iter {
if unmapped_tx.send(template).is_err() {
break; }
}
});
let unmapped_iter = std::iter::from_fn(move || unmapped_rx.recv().ok());
let (mapped_tx, mapped_rx) = std::sync::mpsc::sync_channel::<Result<Template>>(self.buffer);
let mapped_header_for_reader = mapped_header.clone();
std::thread::spawn(move || {
match mapped_reader {
MappedReader::Sam(mut r) => {
let record_iter = r
.record_bufs(&mapped_header_for_reader)
.map(|rec| rec.map_err(anyhow::Error::from));
for template in record_bufs_to_templates(record_iter, &mapped_header_for_reader)
{
if mapped_tx.send(template).is_err() {
break;
}
}
}
MappedReader::Bam(r) => {
for template in TemplateIterator::new(r) {
if mapped_tx.send(template).is_err() {
break;
}
}
}
MappedReader::StdinBam(r) => {
for template in TemplateIterator::new(r) {
if mapped_tx.send(template).is_err() {
break;
}
}
}
}
});
let mapped_iter = std::iter::from_fn(move || mapped_rx.recv().ok());
let total_records = self.process_raw(
unmapped_iter,
mapped_iter,
&output_header,
&tag_info,
reference.as_ref(),
)?;
info!("zipper completed successfully");
timer.log_completion(total_records);
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::sam::SamTag;
use crate::sam::TC_TAG;
use crate::sam::builder::{
MAPPED_PG_ID, REFERENCE_LENGTH, SamBuilder as FgSamBuilder, create_ref_dict,
};
use anyhow::Result;
use bstr::ByteSlice;
use fgumi_raw_bam::{RawRecord, SamBuilder as RawSamBuilder, flags, testutil::encode_op};
use noodles::sam::alignment::record::data::field::Tag;
use noodles::sam::alignment::record_buf::RecordBuf;
use noodles::sam::alignment::record_buf::data::field::Value as BufValue;
use rstest::rstest;
use std::collections::HashMap;
use std::io::Read;
use tempfile::TempDir;
fn to_record_buf(raw: RawRecord) -> RecordBuf {
fgumi_raw_bam::raw_record_to_record_buf(&raw, &noodles::sam::Header::default())
.expect("raw_record_to_record_buf failed")
}
fn tc_info_from_raw(rec: &RawRecord) -> Option<TemplateCoordinateInfo> {
let arr = rec.tags().find_array(b"tc")?;
if arr.elem_type != b'i' || arr.count != 6 {
return None;
}
let read_i32 = |idx: usize| -> i32 {
let off = idx * 4;
i32::from_le_bytes([
arr.data[off],
arr.data[off + 1],
arr.data[off + 2],
arr.data[off + 3],
])
};
Some(TemplateCoordinateInfo {
tid1: read_i32(0),
pos1: read_i32(1),
neg1: read_i32(2) != 0,
tid2: read_i32(3),
pos2: read_i32(4),
neg2: read_i32(5) != 0,
})
}
struct TrickleReader(std::collections::VecDeque<u8>);
impl Read for TrickleReader {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
if buf.is_empty() {
return Ok(0);
}
match self.0.pop_front() {
Some(b) => {
buf[0] = b;
Ok(1)
}
None => Ok(0),
}
}
}
fn trickle(bytes: &[u8]) -> Box<dyn Read + Send> {
Box::new(TrickleReader(bytes.iter().copied().collect()))
}
#[test]
fn test_peek_stdin_bgzf_classifies_bam_under_short_reads() -> Result<()> {
let mut stream = BGZF_MAGIC.to_vec();
stream.extend_from_slice(b"rest-of-bgzf-stream");
let (is_bgzf, mut restored) = peek_stdin_bgzf(trickle(&stream))?;
assert!(is_bgzf);
let mut round_trip = Vec::new();
restored.read_to_end(&mut round_trip)?;
assert_eq!(round_trip, stream);
Ok(())
}
#[test]
fn test_peek_stdin_bgzf_classifies_sam_text() -> Result<()> {
let stream = b"@HD\tVN:1.6\tSO:queryname\n".to_vec();
let (is_bgzf, mut restored) = peek_stdin_bgzf(trickle(&stream))?;
assert!(!is_bgzf);
let mut round_trip = Vec::new();
restored.read_to_end(&mut round_trip)?;
assert_eq!(round_trip, stream);
Ok(())
}
#[test]
fn test_peek_stdin_bgzf_handles_stream_shorter_than_magic() -> Result<()> {
let stream = b"@H".to_vec();
let (is_bgzf, mut restored) = peek_stdin_bgzf(trickle(&stream))?;
assert!(!is_bgzf);
let mut round_trip = Vec::new();
restored.read_to_end(&mut round_trip)?;
assert_eq!(round_trip, stream);
Ok(())
}
#[test]
fn test_peek_stdin_bgzf_handles_empty_stream() -> Result<()> {
let (is_bgzf, mut restored) = peek_stdin_bgzf(Box::new(std::io::empty()))?;
assert!(!is_bgzf);
let mut round_trip = Vec::new();
restored.read_to_end(&mut round_trip)?;
assert!(round_trip.is_empty());
Ok(())
}
fn run_zipper(
unmapped: &FgSamBuilder,
mapped: &FgSamBuilder,
tags_to_remove: Vec<String>,
tags_to_reverse: Vec<String>,
tags_to_revcomp: Vec<String>,
) -> Result<Vec<RecordBuf>> {
let dir = TempDir::new()?;
let unmapped_path = dir.path().join("unmapped.bam");
let mapped_path = dir.path().join("mapped.sam");
let output_path = dir.path().join("output.bam");
let dict_path = create_ref_dict(&dir, "chr1", REFERENCE_LENGTH)?;
unmapped.write(&unmapped_path)?;
mapped.write_sam(&mapped_path)?;
let zipper = Zipper {
input: mapped_path.clone(),
unmapped: unmapped_path.clone(),
reference: dict_path,
output: output_path.clone(),
tags_to_remove,
tags_to_reverse,
tags_to_revcomp,
buffer: 5000,
threads: 1,
compression: CompressionOptions { compression_level: 1 },
bwa_chunk_size: 150_000_000,
exclude_missing_reads: false,
skip_tc_tags: false,
restore_unconverted_bases: false,
};
zipper.execute("test")?;
read_bam_records(&output_path)
}
fn read_bam_records(path: &std::path::Path) -> Result<Vec<RecordBuf>> {
let mut reader = noodles::bam::io::reader::Builder.build_from_path(path)?;
let header = reader.read_header()?;
Ok(reader.record_bufs(&header).collect::<std::io::Result<Vec<_>>>()?)
}
#[test]
fn test_basic_merge() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs1 = HashMap::new();
attrs1.insert("RX", BufValue::from("ACGT".to_string()));
attrs1.insert("xy", BufValue::from(1234i32));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs1);
let mut attrs2 = HashMap::new();
attrs2.insert("RX", BufValue::from("GGTA".to_string()));
attrs2.insert("xy", BufValue::from(4567i32));
unmapped.add_pair_with_attrs("q2", None, None, true, true, &attrs2);
let mut mapped_attrs1 = HashMap::new();
mapped_attrs1.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs1.insert("AS", BufValue::from(77i32));
mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs1);
let mut mapped_attrs2 = HashMap::new();
mapped_attrs2.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs2.insert("AS", BufValue::from(16i32));
mapped.add_pair_with_attrs("q2", Some(500), Some(700), true, true, &mapped_attrs2);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 4);
for rec in &records {
assert!(rec.data().get(&Tag::from(SamTag::RX)).is_some());
assert!(rec.data().get(&Tag::new(b'x', b'y')).is_some());
assert!(rec.data().get(&Tag::from(SamTag::AS)).is_some());
assert!(rec.data().get(&Tag::from(SamTag::PG)).is_some());
assert!(rec.data().get(&Tag::from(SamTag::MC)).is_some(), "MC tag should be present");
assert!(rec.data().get(&Tag::from(SamTag::MQ)).is_some(), "MQ tag should be present");
}
Ok(())
}
#[test]
fn test_unpaired_reads() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs1 = HashMap::new();
attrs1.insert("RX", BufValue::from("ACGT".to_string()));
attrs1.insert("xy", BufValue::from(1234i32));
unmapped.add_frag_with_attrs("q1", None, true, &attrs1);
let mut attrs2 = HashMap::new();
attrs2.insert("RX", BufValue::from("GGTA".to_string()));
attrs2.insert("xy", BufValue::from(4567i32));
unmapped.add_frag_with_attrs("q2", None, true, &attrs2);
let mut mapped_attrs1 = HashMap::new();
mapped_attrs1.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs1.insert("AS", BufValue::from(77i32));
mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs1);
let mut mapped_attrs2 = HashMap::new();
mapped_attrs2.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs2.insert("AS", BufValue::from(16i32));
mapped.add_frag_with_attrs("q2", Some(500), false, &mapped_attrs2);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 2);
for rec in &records {
assert!(rec.data().get(&Tag::from(SamTag::RX)).is_some());
assert!(rec.data().get(&Tag::new(b'x', b'y')).is_some());
assert!(rec.data().get(&Tag::from(SamTag::AS)).is_some());
}
Ok(())
}
#[test]
fn test_tag_removal_reverse_revcomp() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("n1", BufValue::from(vec![1i16, 2, 3, 4, 5]));
attrs.insert("n2", BufValue::from(vec![2i16, 3, 4, 5, 6]));
attrs.insert("s1", BufValue::from("abcde".to_string()));
attrs.insert("s2", BufValue::from("vwxyz".to_string()));
attrs.insert("s3", BufValue::from("AGAGG".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs.insert("AS", BufValue::from(77i32));
mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, false, &mapped_attrs);
let records = run_zipper(
&unmapped,
&mapped,
vec!["AS".to_string()],
vec!["n1".to_string(), "n2".to_string(), "s1".to_string()],
vec!["s3".to_string()],
)?;
assert_eq!(records.len(), 2);
for rec in &records {
assert!(rec.data().get(&Tag::from(SamTag::AS)).is_none());
if rec.flags().is_first_segment() {
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = rec.data().get(&Tag::new(b'n', b'1'))
{
assert_eq!(*vals, vec![1i16, 2, 3, 4, 5]);
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'1')) {
assert_eq!(s.to_string(), "abcde");
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'3')) {
assert_eq!(s.to_string(), "AGAGG");
}
} else {
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = rec.data().get(&Tag::new(b'n', b'1'))
{
assert_eq!(*vals, vec![5i16, 4, 3, 2, 1]);
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'1')) {
assert_eq!(s.to_string(), "edcba");
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'3')) {
assert_eq!(s.to_string(), "CCTCT");
}
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'2')) {
assert_eq!(s.to_string(), "vwxyz");
}
}
Ok(())
}
#[test]
fn test_consensus_tag_set() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("aD", BufValue::from("AAAGG".to_string()));
attrs.insert("bD", BufValue::from("AAAGC".to_string()));
attrs.insert("ad", BufValue::from(vec![3i16, 3, 4, 4, 2]));
unmapped.add_frag_with_attrs("q1", None, true, &attrs);
let mut mapped_attrs1 = HashMap::new();
mapped_attrs1.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs1.insert("AS", BufValue::from(77i32));
mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs1.clone());
let supp_rec = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::SUPPLEMENTARY | flags::REVERSE)
.ref_id(0)
.pos(199)
.mapq(60)
.cigar_ops(&[encode_op(0, 100)])
.sequence(&b"A".repeat(100))
.qualities(&[30u8; 100]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 77);
to_record_buf(b.build())
};
mapped.push_record(supp_rec);
let records = run_zipper(
&unmapped,
&mapped,
vec![],
vec!["Consensus".to_string()],
vec!["Consensus".to_string()],
)?;
assert_eq!(records.len(), 2);
for rec in &records {
if rec.flags().is_reverse_complemented() {
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b'a', b'D')) {
assert_eq!(s.to_string(), "CCTTT");
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b'b', b'D')) {
assert_eq!(s.to_string(), "GCTTT");
}
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = rec.data().get(&Tag::new(b'a', b'd'))
{
assert_eq!(*vals, vec![2i16, 4, 4, 3, 3]);
}
} else {
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b'a', b'D')) {
assert_eq!(s.to_string(), "AAAGG");
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b'b', b'D')) {
assert_eq!(s.to_string(), "AAAGC");
}
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = rec.data().get(&Tag::new(b'a', b'd'))
{
assert_eq!(*vals, vec![3i16, 3, 4, 4, 2]);
}
}
}
Ok(())
}
#[test]
fn test_unmapped_only_reads() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs1 = HashMap::new();
attrs1.insert("RX", BufValue::from("ACGT".to_string()));
attrs1.insert("xy", BufValue::from(1234i32));
unmapped.add_frag_with_attrs("q1", None, true, &attrs1);
let mut attrs2 = HashMap::new();
attrs2.insert("RX", BufValue::from("GATA".to_string()));
attrs2.insert("xy", BufValue::from(3456i32));
unmapped.add_frag_with_attrs("q2", None, true, &attrs2);
let mut attrs3 = HashMap::new();
attrs3.insert("RX", BufValue::from("GGCG".to_string()));
attrs3.insert("xy", BufValue::from(5678i32));
unmapped.add_frag_with_attrs("q3", None, true, &attrs3);
let mut mapped_attrs1 = HashMap::new();
mapped_attrs1.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs1.insert("AS", BufValue::from(77i32));
mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs1);
let mut mapped_attrs3 = HashMap::new();
mapped_attrs3.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs3.insert("AS", BufValue::from(77i32));
mapped.add_frag_with_attrs("q3", Some(200), false, &mapped_attrs3);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 3);
let names: Vec<String> = records
.iter()
.map(|r| {
String::from_utf8(r.name().expect("record should have a name").as_bytes().to_vec())
.expect("record name should be valid UTF-8")
})
.collect();
assert_eq!(names, vec!["q1", "q2", "q3"]);
let q2 = records
.iter()
.find(|r| {
String::from_utf8(r.name().expect("record should have a name").as_bytes().to_vec())
.expect("record name should be valid UTF-8")
== "q2"
})
.expect("expected record q2 not found");
assert!(q2.flags().is_unmapped());
Ok(())
}
#[test]
fn test_tag_info_expansion() {
let tag_info =
TagInfo::new(vec![], vec!["Consensus".to_string()], vec!["Consensus".to_string()]);
assert!(tag_info.reverse.contains("ad"));
assert!(tag_info.reverse.contains("ae"));
assert!(tag_info.reverse.contains("bd"));
assert!(tag_info.reverse.contains("be"));
assert!(tag_info.reverse.contains("cd"));
assert!(tag_info.revcomp.contains("aD"));
assert!(tag_info.revcomp.contains("bD"));
assert!(tag_info.revcomp.contains("cD"));
}
#[test]
fn test_mate_info_fixing() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs.insert("AS", BufValue::from(77i32));
mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 2);
let r1 = records
.iter()
.find(|r| r.flags().is_first_segment())
.expect("expected R1 record not found");
let r2 = records
.iter()
.find(|r| !r.flags().is_first_segment())
.expect("expected R2 record not found");
let r1_mq = r1.data().get(&Tag::from(SamTag::MQ));
let r2_mq = r2.data().get(&Tag::from(SamTag::MQ));
assert!(r1_mq.is_some(), "R1 should have MQ tag");
assert!(r2_mq.is_some(), "R2 should have MQ tag");
let r1_mc = r1.data().get(&Tag::from(SamTag::MC));
let r2_mc = r2.data().get(&Tag::from(SamTag::MC));
assert!(r1_mc.is_some(), "R1 should have MC tag");
assert!(r2_mc.is_some(), "R2 should have MC tag");
Ok(())
}
#[test]
fn test_qc_flag_transfer() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs1 = HashMap::new();
attrs1.insert("RX", BufValue::from("ACGT".to_string()));
let r1_unmapped = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1").flags(flags::PAIRED | flags::FIRST_SEGMENT | flags::UNMAPPED);
b.add_string_tag(b"RX", b"ACGT");
to_record_buf(b.build())
};
let r2_unmapped = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::LAST_SEGMENT | flags::UNMAPPED | flags::QC_FAIL);
b.add_string_tag(b"RX", b"ACGT");
to_record_buf(b.build())
};
unmapped.push_record(r1_unmapped);
unmapped.push_record(r2_unmapped);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs.insert("AS", BufValue::from(77i32));
mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 2);
let r1 = records
.iter()
.find(|r| r.flags().is_first_segment())
.expect("expected R1 record not found");
let r2 = records
.iter()
.find(|r| !r.flags().is_first_segment())
.expect("expected R2 record not found");
assert!(
!r1.flags().is_qc_fail(),
"R1 should not have QC_FAIL flag (passed QC in unmapped)"
);
assert!(r2.flags().is_qc_fail(), "R2 should have QC_FAIL flag (failed QC in unmapped)");
Ok(())
}
#[test]
fn test_qc_flag_removal() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_frag_with_attrs("q1", None, true, &attrs);
let mapped_rec = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::QC_FAIL)
.ref_id(0)
.pos(99)
.mapq(60)
.cigar_ops(&[encode_op(0, 100)])
.sequence(&b"A".repeat(100))
.qualities(&[30u8; 100]);
to_record_buf(b.build())
};
mapped.push_record(mapped_rec);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 1);
assert!(
!records[0].flags().is_qc_fail(),
"QC_FAIL flag should be removed when unmapped read passes QC"
);
Ok(())
}
#[test]
#[allow(clippy::too_many_lines)]
fn test_tag_operations_on_secondary_and_supplementary() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("n1", BufValue::from(vec![1i16, 2, 3, 4, 5]));
attrs.insert("n2", BufValue::from(vec![2i16, 3, 4, 5, 6]));
attrs.insert("s1", BufValue::from("abcde".to_string()));
attrs.insert("s2", BufValue::from("vwxyz".to_string()));
attrs.insert("s3", BufValue::from("AGAGG".to_string()));
unmapped.add_frag_with_attrs("q1", None, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs.insert("AS", BufValue::from(77i32));
mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs.clone());
let secondary_rec = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::SECONDARY | flags::REVERSE)
.ref_id(0)
.pos(199)
.mapq(60)
.cigar_ops(&[encode_op(0, 100)])
.sequence(&b"A".repeat(100))
.qualities(&[30u8; 100]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 77);
to_record_buf(b.build())
};
mapped.push_record(secondary_rec);
let supp_rec = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::SUPPLEMENTARY | flags::REVERSE)
.ref_id(0)
.pos(299)
.mapq(60)
.cigar_ops(&[encode_op(0, 100)])
.sequence(&b"A".repeat(100))
.qualities(&[30u8; 100]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 77);
to_record_buf(b.build())
};
mapped.push_record(supp_rec);
let records = run_zipper(
&unmapped,
&mapped,
vec!["AS".to_string()],
vec!["n1".to_string(), "n2".to_string(), "s1".to_string()],
vec!["s3".to_string()],
)?;
assert_eq!(records.len(), 3, "Should have 3 records (primary, secondary, supplementary)");
let secondary_and_supp_count = records
.iter()
.filter(|r| r.flags().is_secondary() || r.flags().is_supplementary())
.count();
assert_eq!(secondary_and_supp_count, 2, "Should have 2 secondary/supplementary records");
let neg_strand_count =
records.iter().filter(|r| r.flags().is_reverse_complemented()).count();
assert_eq!(neg_strand_count, 2, "Should have 2 negative strand records");
for rec in &records {
assert!(rec.data().get(&Tag::from(SamTag::AS)).is_none(), "AS tag should be removed");
if rec.flags().is_reverse_complemented() {
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = rec.data().get(&Tag::new(b'n', b'1'))
{
assert_eq!(
*vals,
vec![5i16, 4, 3, 2, 1],
"n1 should be reversed for negative strand"
);
}
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = rec.data().get(&Tag::new(b'n', b'2'))
{
assert_eq!(
*vals,
vec![6i16, 5, 4, 3, 2],
"n2 should be reversed for negative strand"
);
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'1')) {
assert_eq!(s.to_string(), "edcba", "s1 should be reversed for negative strand");
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'2')) {
assert_eq!(s.to_string(), "vwxyz", "s2 should not be changed");
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'3')) {
assert_eq!(
s.to_string(),
"CCTCT",
"s3 should be revcomped for negative strand"
);
}
} else {
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = rec.data().get(&Tag::new(b'n', b'1'))
{
assert_eq!(
*vals,
vec![1i16, 2, 3, 4, 5],
"n1 should be unchanged for positive strand"
);
}
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = rec.data().get(&Tag::new(b'n', b'2'))
{
assert_eq!(
*vals,
vec![2i16, 3, 4, 5, 6],
"n2 should be unchanged for positive strand"
);
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'1')) {
assert_eq!(
s.to_string(),
"abcde",
"s1 should be unchanged for positive strand"
);
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'2')) {
assert_eq!(s.to_string(), "vwxyz", "s2 should not be changed");
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'3')) {
assert_eq!(
s.to_string(),
"AGAGG",
"s3 should be unchanged for positive strand"
);
}
}
}
Ok(())
}
#[test]
fn test_multithreaded_processing() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
for i in 0..10 {
let name = format!("q{i}");
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from(format!("ACG{i}")));
attrs.insert("xy", BufValue::from(1000 + i as i32));
unmapped.add_pair_with_attrs(&name, None, None, true, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped_attrs.insert("AS", BufValue::from(77i32));
mapped.add_pair_with_attrs(
&name,
Some(100 + i * 100),
Some(200 + i * 100),
true,
true,
&mapped_attrs,
);
}
let dir = TempDir::new()?;
let unmapped_path = dir.path().join("unmapped.bam");
let mapped_path = dir.path().join("mapped.sam");
let output_path = dir.path().join("output.bam");
let dict_path = create_ref_dict(&dir, "chr1", REFERENCE_LENGTH)?;
unmapped.write(&unmapped_path)?;
mapped.write_sam(&mapped_path)?;
let zipper = Zipper {
input: mapped_path.clone(),
unmapped: unmapped_path.clone(),
reference: dict_path,
output: output_path.clone(),
tags_to_remove: vec![],
tags_to_reverse: vec![],
tags_to_revcomp: vec![],
buffer: 5000,
threads: 4,
compression: CompressionOptions { compression_level: 1 },
bwa_chunk_size: 150_000_000,
exclude_missing_reads: false,
skip_tc_tags: false,
restore_unconverted_bases: false,
};
zipper.execute("test")?;
let records = read_bam_records(&output_path)?;
assert_eq!(records.len(), 20);
for rec in &records {
assert!(rec.data().get(&Tag::from(SamTag::RX)).is_some());
assert!(rec.data().get(&Tag::new(b'x', b'y')).is_some());
assert!(rec.data().get(&Tag::from(SamTag::AS)).is_some());
}
Ok(())
}
#[test]
fn test_empty_unmapped() -> Result<()> {
let unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs);
let result = run_zipper(&unmapped, &mapped, vec![], vec![], vec![]);
assert!(result.is_err());
let err_msg = result.unwrap_err().to_string();
assert!(
err_msg.contains("mapped reads remaining"),
"Expected error about leftover mapped reads, got: {err_msg}"
);
Ok(())
}
#[test]
fn test_merge_mixed_strands() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
attrs.insert("xy", BufValue::from(1234i32));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 2);
for rec in &records {
assert!(rec.data().get(&Tag::from(SamTag::RX)).is_some());
assert!(rec.data().get(&Tag::new(b'x', b'y')).is_some());
}
Ok(())
}
#[test]
fn test_tag_info_has_revs_or_revcomps() {
let tag_info1 = TagInfo::new(vec![], vec![], vec![]);
assert!(!tag_info1.has_revs_or_revcomps());
let tag_info2 = TagInfo::new(vec![], vec!["n1".to_string()], vec![]);
assert!(tag_info2.has_revs_or_revcomps());
let tag_info3 = TagInfo::new(vec![], vec![], vec!["s1".to_string()]);
assert!(tag_info3.has_revs_or_revcomps());
let tag_info4 = TagInfo::new(vec![], vec!["n1".to_string()], vec!["s1".to_string()]);
assert!(tag_info4.has_revs_or_revcomps());
}
#[test]
fn test_multiple_secondary_alignments() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_frag_with_attrs("q1", None, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs.clone());
for i in 0..3 {
let secondary_rec = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::SECONDARY)
.ref_id(0)
.pos(199 + i * 100)
.mapq(60)
.cigar_ops(&[encode_op(0, 100)])
.sequence(&b"A".repeat(100))
.qualities(&[30u8; 100]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
to_record_buf(b.build())
};
mapped.push_record(secondary_rec);
}
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 4, "Should have 1 primary + 3 secondary alignments");
for rec in &records {
assert!(rec.data().get(&Tag::from(SamTag::RX)).is_some());
}
Ok(())
}
#[test]
fn test_r2_only_mapping() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let r2_mapped = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::LAST_SEGMENT)
.ref_id(0)
.pos(199)
.mapq(60)
.cigar_ops(&[encode_op(0, 100)])
.sequence(&b"A".repeat(100))
.qualities(&[30u8; 100]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
to_record_buf(b.build())
};
mapped.push_record(r2_mapped);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 1, "Should have only R2 in output");
let r2 = &records[0];
assert!(!r2.flags().is_first_segment());
assert!(r2.data().get(&Tag::from(SamTag::RX)).is_some());
Ok(())
}
fn run_zipper_with_options(
unmapped: &FgSamBuilder,
mapped: &FgSamBuilder,
exclude_missing_reads: bool,
) -> Result<Vec<RecordBuf>> {
let dir = TempDir::new()?;
let unmapped_path = dir.path().join("unmapped.bam");
let mapped_path = dir.path().join("mapped.sam");
let output_path = dir.path().join("output.bam");
let dict_path = create_ref_dict(&dir, "chr1", REFERENCE_LENGTH)?;
unmapped.write(&unmapped_path)?;
mapped.write_sam(&mapped_path)?;
let zipper = Zipper {
input: mapped_path.clone(),
unmapped: unmapped_path.clone(),
reference: dict_path,
output: output_path.clone(),
tags_to_remove: vec![],
tags_to_reverse: vec![],
tags_to_revcomp: vec![],
buffer: 5000,
threads: 1,
compression: CompressionOptions { compression_level: 1 },
bwa_chunk_size: 150_000_000,
exclude_missing_reads,
skip_tc_tags: false,
restore_unconverted_bases: false,
};
zipper.execute("test")?;
read_bam_records(&output_path)
}
#[test]
fn test_both_inputs_empty() -> Result<()> {
let unmapped = FgSamBuilder::new_unmapped();
let mapped = FgSamBuilder::new_mapped();
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 0);
Ok(())
}
#[test]
fn test_exclude_missing_reads() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
unmapped.add_pair_with_attrs("q2", None, None, true, true, &attrs);
unmapped.add_pair_with_attrs("q3", None, None, true, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped.add_pair_with_attrs("q1", Some(100), Some(200), true, true, &mapped_attrs);
mapped.add_pair_with_attrs("q3", Some(300), Some(400), true, true, &mapped_attrs);
let records_with_missing = run_zipper_with_options(&unmapped, &mapped, false)?;
assert_eq!(records_with_missing.len(), 6, "Should have 6 records (3 pairs)");
let records_without_missing = run_zipper_with_options(&unmapped, &mapped, true)?;
assert_eq!(records_without_missing.len(), 4, "Should have 4 records (2 pairs)");
for rec in &records_without_missing {
if let Some(name) = rec.name() {
let name_str = String::from_utf8_lossy(name.as_ref());
assert_ne!(name_str, "q2", "q2 should be excluded");
}
}
Ok(())
}
#[test]
fn test_file_not_found_error() -> Result<()> {
let dir = TempDir::new()?;
let unmapped_path = dir.path().join("nonexistent_unmapped.bam");
let mapped_path = dir.path().join("mapped.sam");
let output_path = dir.path().join("output.bam");
let mapped = FgSamBuilder::new_mapped();
mapped.write_sam(&mapped_path)?;
let zipper = Zipper {
input: mapped_path,
unmapped: unmapped_path.clone(),
reference: dir.path().join("ref.fa"),
output: output_path,
tags_to_remove: vec![],
tags_to_reverse: vec![],
tags_to_revcomp: vec![],
buffer: 5000,
threads: 1,
compression: CompressionOptions { compression_level: 1 },
bwa_chunk_size: 150_000_000,
exclude_missing_reads: false,
skip_tc_tags: false,
restore_unconverted_bases: false,
};
let result = zipper.execute("test");
assert!(result.is_err(), "Should error when reference file doesn't exist");
let err_msg = result.unwrap_err().to_string();
assert!(
err_msg.contains("does not exist") || err_msg.contains("not found"),
"Error should mention file not found: {err_msg}"
);
Ok(())
}
const FLAG_PAIRED: u16 = 0x1;
const FLAG_READ1: u16 = 0x40;
const FLAG_READ2: u16 = 0x80;
const FLAG_SECONDARY: u16 = 0x100;
const FLAG_SUPPLEMENTARY: u16 = 0x800;
const FLAG_REVERSE: u16 = 0x10;
#[test]
fn test_add_pa_tag_to_supplementary_r1() -> Result<()> {
use crate::template::Template;
let primary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(99)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ACGTACGT")
.qualities(&[30u8; 8]);
b.build()
};
let supplementary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY)
.ref_id(5)
.pos(4999)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ACGTACGT")
.qualities(&[30u8; 8]);
b.build()
};
let mut template = Template::from_records(vec![primary_r1, supplementary_r1])?;
add_template_coordinate_tags_raw(&mut template);
let supp = template
.records()
.iter()
.find(|r| r.is_supplementary())
.expect("Should have supplementary");
let tc_info = tc_info_from_raw(supp).expect("Supplementary should have pa tag");
assert_eq!(tc_info.tid1, 0, "tid1 should be R1's reference");
assert_eq!(tc_info.pos1, 100, "pos1 should be R1's position");
assert!(!tc_info.neg1, "neg1 should be false (forward strand)");
assert_eq!(tc_info.tid2, 0, "tid2 should equal tid1 (single read)");
assert_eq!(tc_info.pos2, 100, "pos2 should equal pos1 (single read)");
assert!(!tc_info.neg2, "neg2 should equal neg1 (single read)");
let primary = template.r1().expect("Should have primary R1");
assert!(tc_info_from_raw(primary).is_none(), "Primary should not have pa tag");
Ok(())
}
#[test]
fn test_add_pa_tag_to_secondary_reverse_strand() -> Result<()> {
use crate::template::Template;
let primary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1 | FLAG_REVERSE)
.ref_id(0)
.pos(199)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ACGTACGT")
.qualities(&[30u8; 8]);
b.build()
};
let secondary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1 | FLAG_SECONDARY)
.ref_id(3)
.pos(2999)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ACGTACGT")
.qualities(&[30u8; 8]);
b.build()
};
let mut template = Template::from_records(vec![primary_r1, secondary_r1])?;
add_template_coordinate_tags_raw(&mut template);
let secondary =
template.records().iter().find(|r| r.is_secondary()).expect("Should have secondary");
let tc_info = tc_info_from_raw(secondary).expect("Secondary should have pa tag");
assert_eq!(tc_info.tid1, 0, "tid1 should be R1's reference");
assert_eq!(tc_info.pos1, 207, "pos1 should be R1's unclipped 5' position");
assert!(tc_info.neg1, "neg1 should be true (reverse strand)");
Ok(())
}
#[test]
fn test_add_pa_tag_paired_end_r2_supplementary() -> Result<()> {
use crate::template::Template;
let primary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(99)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ACGTACGT")
.qualities(&[30u8; 8]);
b.build()
};
let primary_r2 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ2 | FLAG_REVERSE)
.ref_id(0)
.pos(299)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ACGTACGT")
.qualities(&[30u8; 8]);
b.build()
};
let supplementary_r2 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ2 | FLAG_SUPPLEMENTARY)
.ref_id(5)
.pos(4999)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"ACGT")
.qualities(&[30u8; 4]);
b.build()
};
let mut template = Template::from_records(vec![primary_r1, primary_r2, supplementary_r2])?;
add_template_coordinate_tags_raw(&mut template);
let supp = template
.records()
.iter()
.find(|r| r.is_supplementary())
.expect("Should have supplementary");
let tc_info = tc_info_from_raw(supp).expect("Supplementary R2 should have pa tag");
assert_eq!(tc_info.tid1, 0, "tid1 should be R1's reference (earlier)");
assert_eq!(tc_info.pos1, 100, "pos1 should be R1's unclipped 5' position (earlier)");
assert!(!tc_info.neg1, "neg1 should be false (R1 forward strand)");
assert_eq!(tc_info.tid2, 0, "tid2 should be R2's reference (later)");
assert_eq!(tc_info.pos2, 307, "pos2 should be R2's unclipped 5' position (later)");
assert!(tc_info.neg2, "neg2 should be true (R2 reverse strand)");
Ok(())
}
#[test]
fn test_no_pa_tag_when_no_primary() -> Result<()> {
use crate::template::Template;
let supplementary = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY)
.ref_id(5)
.pos(4999)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"ACGT")
.qualities(&[30u8; 4]);
b.build()
};
let mut template = Template::from_records(vec![supplementary])?;
add_template_coordinate_tags_raw(&mut template);
let supp = template
.records()
.iter()
.find(|r| r.is_supplementary())
.expect("Should have supplementary");
assert!(
tc_info_from_raw(supp).is_none(),
"Supplementary without primary should not have pa tag"
);
Ok(())
}
#[test]
fn test_pa_tag_added_during_merge() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let _ = mapped.add_pair().name("q1").start1(100).start2(200).build();
let supp = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY)
.ref_id(0)
.pos(4999)
.mapq(60)
.cigar_ops(&[encode_op(0, 12)])
.sequence(b"ACGTACGTACGT")
.qualities(&[30u8; 12]);
to_record_buf(b.build())
};
mapped.push_record(supp);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
let supp_record = records
.iter()
.find(|r| r.flags().is_supplementary())
.expect("Should have supplementary in output");
let tc_value =
supp_record.data().get(&TC_TAG).expect("Supplementary should have pa tag after merge");
let tc_info = TemplateCoordinateInfo::from_tag_value(tc_value)
.expect("Should be able to parse pa tag");
assert_eq!(tc_info.tid1, 0, "tid1 should be 0");
assert_eq!(tc_info.pos1, 100, "pos1 should be R1's unclipped 5' position");
assert_eq!(tc_info.tid2, 0, "tid2 should be 0");
assert_eq!(tc_info.pos2, 299, "pos2 should be R2's unclipped 5' position (reverse strand)");
let rx_tag = Tag::from(SamTag::RX);
assert!(
supp_record.data().get(&rx_tag).is_some(),
"Supplementary should also have RX tag copied from unmapped"
);
Ok(())
}
#[test]
fn test_add_pa_tag_early_exit_no_secondary_supplementary() -> Result<()> {
use crate::template::Template;
let primary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(99)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"ACGT")
.qualities(&[30u8; 4]);
b.build()
};
let primary_r2 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ2 | FLAG_REVERSE)
.ref_id(0)
.pos(199)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"TGCA")
.qualities(&[30u8; 4]);
b.build()
};
let mut template = Template::from_records(vec![primary_r1, primary_r2])?;
add_template_coordinate_tags_raw(&mut template);
for record in template.records() {
assert!(tc_info_from_raw(record).is_none(), "Primary reads should not have pa tag");
}
Ok(())
}
#[test]
fn test_add_pa_tag_only_to_secondary_supplementary() -> Result<()> {
use crate::template::Template;
let primary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(99)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"ACGT")
.qualities(&[30u8; 4]);
b.build()
};
let secondary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1 | FLAG_SECONDARY)
.ref_id(5)
.pos(4999)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"ACGT")
.qualities(&[30u8; 4]);
b.build()
};
let mut template = Template::from_records(vec![primary_r1, secondary_r1])?;
add_template_coordinate_tags_raw(&mut template);
for record in template.records() {
if record.is_secondary() {
assert!(tc_info_from_raw(record).is_some(), "Secondary should have pa tag");
} else {
assert!(tc_info_from_raw(record).is_none(), "Primary should not have pa tag");
}
}
Ok(())
}
#[test]
fn test_skip_tc_tags_parameter() -> Result<()> {
use crate::template::Template;
const FLAG_UNMAPPED: u16 = 0x4;
let primary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(99)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"ACGT")
.qualities(&[30u8; 4]);
b.build()
};
let supplementary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY)
.ref_id(5)
.pos(4999)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"ACGT")
.qualities(&[30u8; 4]);
b.build()
};
let mut template = Template::from_records(vec![primary_r1, supplementary_r1])?;
let unmapped_record = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1").flags(FLAG_PAIRED | FLAG_READ1 | FLAG_UNMAPPED);
b.add_string_tag(b"RX", b"AAAA");
b.sequence(b"ACGT").qualities(&[30u8; 4]);
b.build()
};
let unmapped = Template::from_records(vec![unmapped_record])?;
let tag_info = TagInfo::new(vec![], vec![], vec![]);
merge_raw(&unmapped, &mut template, &tag_info, true)?;
for record in template.records() {
assert!(
tc_info_from_raw(record).is_none(),
"No records should have pa tag when skip_tc_tags=true"
);
}
Ok(())
}
#[test]
fn test_skip_tc_tags_false_adds_tags() -> Result<()> {
use crate::template::Template;
const FLAG_UNMAPPED: u16 = 0x4;
let primary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(99)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"ACGT")
.qualities(&[30u8; 4]);
b.build()
};
let supplementary_r1 = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1 | FLAG_SUPPLEMENTARY)
.ref_id(5)
.pos(4999)
.mapq(60)
.cigar_ops(&[encode_op(0, 4)])
.sequence(b"ACGT")
.qualities(&[30u8; 4]);
b.build()
};
let mut template = Template::from_records(vec![primary_r1, supplementary_r1])?;
let unmapped_record = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1").flags(FLAG_PAIRED | FLAG_READ1 | FLAG_UNMAPPED);
b.add_string_tag(b"RX", b"AAAA");
b.sequence(b"ACGT").qualities(&[30u8; 4]);
b.build()
};
let unmapped = Template::from_records(vec![unmapped_record])?;
let tag_info = TagInfo::new(vec![], vec![], vec![]);
merge_raw(&unmapped, &mut template, &tag_info, false)?;
let has_pa_tag = template
.records()
.iter()
.any(|r| r.is_supplementary() && tc_info_from_raw(r).is_some());
assert!(has_pa_tag, "Supplementary should have pa tag when skip_tc_tags=false");
Ok(())
}
#[test]
fn test_as_xs_normalization() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_frag_with_attrs("q1", None, true, &attrs);
let mapped_rec = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(0)
.ref_id(0)
.pos(99)
.mapq(60)
.cigar_ops(&[encode_op(0, 100)])
.sequence(&b"A".repeat(100))
.qualities(&[30u8; 100]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 77);
b.add_int_tag(b"XS", 50);
to_record_buf(b.build())
};
mapped.push_record(mapped_rec);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 1);
let as_val = records[0].data().get(&Tag::from(SamTag::AS)).unwrap();
assert!(matches!(as_val, BufValue::Int8(77)), "AS=77 should be Int8, got {as_val:?}");
let xs_val = records[0].data().get(&Tag::from(SamTag::XS)).unwrap();
assert!(matches!(xs_val, BufValue::Int8(50)), "XS=50 should be Int8, got {xs_val:?}");
Ok(())
}
#[test]
fn test_as_xs_normalization_large_values() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_frag_with_attrs("q1", None, true, &attrs);
let mapped_rec = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(0)
.ref_id(0)
.pos(99)
.mapq(60)
.cigar_ops(&[encode_op(0, 100)])
.sequence(&b"A".repeat(100))
.qualities(&[30u8; 100]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 200);
b.add_int_tag(b"XS", -200);
to_record_buf(b.build())
};
mapped.push_record(mapped_rec);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 1);
let as_val = records[0].data().get(&Tag::from(SamTag::AS)).unwrap();
assert!(matches!(as_val, BufValue::Int16(200)), "AS=200 should be Int16, got {as_val:?}");
let xs_val = records[0].data().get(&Tag::from(SamTag::XS)).unwrap();
assert!(matches!(xs_val, BufValue::Int16(-200)), "XS=-200 should be Int16, got {xs_val:?}");
Ok(())
}
#[test]
fn test_negative_strand_r1_tag_copying() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("n1", BufValue::from(vec![1i16, 2, 3]));
attrs.insert("s1", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped.add_pair_with_attrs("q1", Some(100), Some(200), false, true, &mapped_attrs);
let records =
run_zipper(&unmapped, &mapped, vec![], vec!["n1".to_string()], vec!["s1".to_string()])?;
assert_eq!(records.len(), 2);
let r1 = records.iter().find(|r| r.flags().is_first_segment()).unwrap();
let r2 = records.iter().find(|r| !r.flags().is_first_segment()).unwrap();
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = r1.data().get(&Tag::new(b'n', b'1'))
{
assert_eq!(*vals, vec![3i16, 2, 1], "n1 should be reversed for R1 (negative strand)");
} else {
panic!("n1 tag not found or wrong type on R1");
}
if let Some(BufValue::String(s)) = r1.data().get(&Tag::new(b's', b'1')) {
assert_eq!(s.to_string(), "ACGT", "s1 should be revcomped for R1 (negative strand)");
} else {
panic!("s1 tag not found or wrong type on R1");
}
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = r2.data().get(&Tag::new(b'n', b'1'))
{
assert_eq!(*vals, vec![1i16, 2, 3], "n1 should be unchanged for R2 (positive strand)");
} else {
panic!("n1 tag not found or wrong type on R2");
}
if let Some(BufValue::String(s)) = r2.data().get(&Tag::new(b's', b'1')) {
assert_eq!(s.to_string(), "ACGT", "s1 should be unchanged for R2 (positive strand)");
} else {
panic!("s1 tag not found or wrong type on R2");
}
Ok(())
}
#[test]
fn test_negative_strand_both_reads() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("n1", BufValue::from(vec![10i16, 20, 30]));
attrs.insert("s1", BufValue::from("AACC".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped.add_pair_with_attrs("q1", Some(200), Some(100), false, false, &mapped_attrs);
let records =
run_zipper(&unmapped, &mapped, vec![], vec!["n1".to_string()], vec!["s1".to_string()])?;
assert_eq!(records.len(), 2);
for rec in &records {
if let Some(BufValue::Array(
noodles::sam::alignment::record_buf::data::field::value::Array::Int16(vals),
)) = rec.data().get(&Tag::new(b'n', b'1'))
{
assert_eq!(*vals, vec![30i16, 20, 10], "n1 should be reversed on negative strand");
} else {
panic!("n1 tag not found or wrong type");
}
if let Some(BufValue::String(s)) = rec.data().get(&Tag::new(b's', b'1')) {
assert_eq!(s.to_string(), "GGTT", "s1 should be revcomped on negative strand");
} else {
panic!("s1 tag not found or wrong type");
}
}
Ok(())
}
#[test]
fn test_fix_mate_info_values() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let r1_mapped = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::FIRST_SEGMENT | flags::MATE_REVERSE)
.ref_id(0)
.pos(99)
.mapq(40)
.cigar_ops(&[encode_op(0, 50)])
.sequence(&b"A".repeat(50))
.qualities(&[30u8; 50]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
to_record_buf(b.build())
};
let r2_mapped = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::LAST_SEGMENT | flags::REVERSE | 0x2)
.ref_id(0)
.pos(299)
.mapq(30)
.cigar_ops(&[encode_op(0, 75)])
.sequence(&b"A".repeat(75))
.qualities(&[30u8; 75]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
to_record_buf(b.build())
};
mapped.push_record(r1_mapped);
mapped.push_record(r2_mapped);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 2);
let r1 = records.iter().find(|r| r.flags().is_first_segment()).unwrap();
let r2 = records.iter().find(|r| !r.flags().is_first_segment()).unwrap();
assert_eq!(
r1.mate_reference_sequence_id(),
r2.reference_sequence_id(),
"R1's RNEXT should equal R2's reference"
);
assert_eq!(
r1.mate_alignment_start(),
r2.alignment_start(),
"R1's PNEXT should equal R2's position"
);
assert_eq!(
r2.mate_reference_sequence_id(),
r1.reference_sequence_id(),
"R2's RNEXT should equal R1's reference"
);
assert_eq!(
r2.mate_alignment_start(),
r1.alignment_start(),
"R2's PNEXT should equal R1's position"
);
let r1_tlen = r1.template_length();
let r2_tlen = r2.template_length();
assert!(r1_tlen != 0, "R1 TLEN should be non-zero for mapped pair");
assert_eq!(r1_tlen, -r2_tlen, "R1 TLEN should be negation of R2 TLEN");
let r1_mq = r1.data().get(&Tag::from(SamTag::MQ));
let r2_mq = r2.data().get(&Tag::from(SamTag::MQ));
assert!(
matches!(r1_mq, Some(BufValue::Int8(30))),
"R1's MQ should be R2's mapq (30), got {r1_mq:?}"
);
assert!(
matches!(r2_mq, Some(BufValue::Int8(40))),
"R2's MQ should be R1's mapq (40), got {r2_mq:?}"
);
if let Some(BufValue::String(mc)) = r1.data().get(&Tag::from(SamTag::MC)) {
assert_eq!(mc.to_string(), "75M", "R1's MC should be R2's CIGAR");
} else {
panic!("R1 should have MC tag");
}
if let Some(BufValue::String(mc)) = r2.data().get(&Tag::from(SamTag::MC)) {
assert_eq!(mc.to_string(), "50M", "R2's MC should be R1's CIGAR");
} else {
panic!("R2 should have MC tag");
}
Ok(())
}
#[test]
fn test_fix_mate_info_supplementary() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let r1_mapped = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::FIRST_SEGMENT | flags::MATE_REVERSE)
.ref_id(0)
.pos(99)
.mapq(40)
.cigar_ops(&[encode_op(0, 50)])
.sequence(&b"A".repeat(50))
.qualities(&[30u8; 50]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 77);
to_record_buf(b.build())
};
let r2_mapped = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::LAST_SEGMENT | flags::REVERSE | 0x2)
.ref_id(0)
.pos(299)
.mapq(30)
.cigar_ops(&[encode_op(0, 75)])
.sequence(&b"A".repeat(75))
.qualities(&[30u8; 75]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 55);
to_record_buf(b.build())
};
mapped.push_record(r1_mapped);
mapped.push_record(r2_mapped);
let supp_rec = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::FIRST_SEGMENT | flags::SUPPLEMENTARY | flags::REVERSE)
.ref_id(0)
.pos(499)
.mapq(20)
.cigar_ops(&[encode_op(0, 60)])
.sequence(&b"A".repeat(60))
.qualities(&[30u8; 60]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 33);
to_record_buf(b.build())
};
mapped.push_record(supp_rec);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 3);
let r1_primary = records
.iter()
.find(|r| r.flags().is_first_segment() && !r.flags().is_supplementary())
.unwrap();
let r2_primary = records
.iter()
.find(|r| !r.flags().is_first_segment() && !r.flags().is_supplementary())
.unwrap();
let r1_supp = records
.iter()
.find(|r| r.flags().is_first_segment() && r.flags().is_supplementary())
.unwrap();
assert_eq!(
r1_supp.mate_reference_sequence_id(),
r2_primary.reference_sequence_id(),
"R1 supp RNEXT should point to R2 primary's reference"
);
assert_eq!(
r1_supp.mate_alignment_start(),
r2_primary.alignment_start(),
"R1 supp PNEXT should point to R2 primary's position"
);
let supp_mq = r1_supp.data().get(&Tag::from(SamTag::MQ));
assert!(
matches!(supp_mq, Some(BufValue::Int8(30))),
"R1 supp MQ should be R2 primary's mapq (30), got {supp_mq:?}"
);
if let Some(BufValue::String(mc)) = r1_supp.data().get(&Tag::from(SamTag::MC)) {
assert_eq!(mc.to_string(), "75M", "R1 supp MC should be R2 primary's CIGAR");
} else {
panic!("R1 supp should have MC tag");
}
let supp_ms = r1_supp.data().get(&Tag::from(SamTag::MS));
assert!(
matches!(supp_ms, Some(BufValue::Int8(55))),
"R1 supp ms should be R2 primary's AS (55), got {supp_ms:?}"
);
let r1_primary_tlen = r1_primary.template_length();
let r1_supp_tlen = r1_supp.template_length();
assert_eq!(
r1_supp_tlen, r1_primary_tlen,
"R1 supp TLEN should equal R1 primary TLEN (both = -R2_TLEN)"
);
Ok(())
}
#[test]
fn test_fix_mate_info_one_unmapped() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped.add_pair_with_attrs("q1", Some(100), None, true, true, &mapped_attrs);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 2);
let r1 = records.iter().find(|r| r.flags().is_first_segment()).unwrap();
let r2 = records.iter().find(|r| !r.flags().is_first_segment()).unwrap();
assert_eq!(r1.template_length(), 0, "R1 TLEN should be 0 when mate is unmapped");
assert_eq!(r2.template_length(), 0, "R2 TLEN should be 0 when mate is unmapped");
assert!(
r1.data().get(&Tag::from(SamTag::MQ)).is_none(),
"Mapped read should not have MQ when mate is unmapped"
);
assert!(
r1.data().get(&Tag::from(SamTag::MC)).is_none(),
"Mapped read should not have MC when mate is unmapped"
);
Ok(())
}
#[test]
fn test_fix_mate_info_r2_supplementary() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let r1_mapped = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::FIRST_SEGMENT | flags::MATE_REVERSE)
.ref_id(0)
.pos(99)
.mapq(40)
.cigar_ops(&[encode_op(0, 50)])
.sequence(&b"A".repeat(50))
.qualities(&[30u8; 50]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 77);
to_record_buf(b.build())
};
let r2_mapped = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::LAST_SEGMENT | flags::REVERSE | 0x2)
.ref_id(0)
.pos(299)
.mapq(30)
.cigar_ops(&[encode_op(0, 75)])
.sequence(&b"A".repeat(75))
.qualities(&[30u8; 75]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 55);
to_record_buf(b.build())
};
mapped.push_record(r1_mapped);
mapped.push_record(r2_mapped);
let supp_rec = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(flags::PAIRED | flags::LAST_SEGMENT | flags::SUPPLEMENTARY | flags::REVERSE)
.ref_id(0)
.pos(499)
.mapq(20)
.cigar_ops(&[encode_op(0, 60)])
.sequence(&b"A".repeat(60))
.qualities(&[30u8; 60]);
b.add_string_tag(b"PG", MAPPED_PG_ID.as_bytes());
b.add_int_tag(b"AS", 33);
to_record_buf(b.build())
};
mapped.push_record(supp_rec);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 3);
let r1_primary = records
.iter()
.find(|r| r.flags().is_first_segment() && !r.flags().is_supplementary())
.unwrap();
let r2_supp = records
.iter()
.find(|r| !r.flags().is_first_segment() && r.flags().is_supplementary())
.unwrap();
assert_eq!(
r2_supp.mate_reference_sequence_id(),
r1_primary.reference_sequence_id(),
"R2 supp RNEXT should point to R1 primary's reference"
);
assert_eq!(
r2_supp.mate_alignment_start(),
r1_primary.alignment_start(),
"R2 supp PNEXT should point to R1 primary's position"
);
let supp_mq = r2_supp.data().get(&Tag::from(SamTag::MQ));
assert!(
matches!(supp_mq, Some(BufValue::Int8(40))),
"R2 supp MQ should be R1 primary's mapq (40), got {supp_mq:?}"
);
if let Some(BufValue::String(mc)) = r2_supp.data().get(&Tag::from(SamTag::MC)) {
assert_eq!(mc.to_string(), "50M", "R2 supp MC should be R1 primary's CIGAR");
} else {
panic!("R2 supp should have MC tag");
}
let supp_ms = r2_supp.data().get(&Tag::from(SamTag::MS));
assert!(
matches!(supp_ms, Some(BufValue::Int8(77))),
"R2 supp ms should be R1 primary's AS (77), got {supp_ms:?}"
);
Ok(())
}
#[test]
fn test_fix_mate_info_both_unmapped() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
unmapped.add_pair_with_attrs("q1", None, None, true, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped.add_pair_with_attrs("q1", None, None, true, true, &mapped_attrs);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 2);
for rec in &records {
assert_eq!(rec.template_length(), 0, "TLEN should be 0 when both unmapped");
assert!(
rec.data().get(&Tag::from(SamTag::MQ)).is_none(),
"MQ should not exist when both unmapped"
);
assert!(
rec.data().get(&Tag::from(SamTag::MC)).is_none(),
"MC should not exist when both unmapped"
);
}
Ok(())
}
#[test]
fn test_varied_tag_types_roundtrip() -> Result<()> {
let mut unmapped = FgSamBuilder::new_unmapped();
let mut mapped = FgSamBuilder::new_mapped();
let mut attrs = HashMap::new();
attrs.insert("RX", BufValue::from("ACGT".to_string()));
attrs.insert("ca", BufValue::Character(b'X'));
attrs.insert("fi", BufValue::Float(1.23));
attrs.insert("ui", BufValue::UInt8(200));
attrs.insert("si", BufValue::Int16(-300));
attrs.insert("ul", BufValue::UInt32(100_000));
attrs.insert("hx", BufValue::Hex(bstr::BString::from("DEADBEEF")));
unmapped.add_frag_with_attrs("q1", None, true, &attrs);
let mut mapped_attrs = HashMap::new();
mapped_attrs.insert("PG", BufValue::from(MAPPED_PG_ID.to_string()));
mapped.add_frag_with_attrs("q1", Some(100), true, &mapped_attrs);
let records = run_zipper(&unmapped, &mapped, vec![], vec![], vec![])?;
assert_eq!(records.len(), 1);
let rec = &records[0];
assert!(
matches!(rec.data().get(&Tag::new(b'c', b'a')), Some(BufValue::Character(b'X'))),
"Character tag should roundtrip"
);
if let Some(BufValue::Float(f)) = rec.data().get(&Tag::new(b'f', b'i')) {
assert!((f - 1.23).abs() < 0.001, "Float tag should roundtrip, got {f}");
} else {
panic!("Float tag missing");
}
let ui_val = rec.data().get(&Tag::new(b'u', b'i'));
assert!(ui_val.is_some(), "UInt8 tag should exist");
let si_val = rec.data().get(&Tag::new(b's', b'i'));
assert!(
matches!(si_val, Some(BufValue::Int16(-300))),
"Int16 tag should roundtrip, got {si_val:?}"
);
let ul_val = rec.data().get(&Tag::new(b'u', b'l'));
assert!(ul_val.is_some(), "UInt32 tag should exist");
if let Some(BufValue::Hex(h)) = rec.data().get(&Tag::new(b'h', b'x')) {
assert_eq!(h.to_string(), "DEADBEEF", "Hex tag should roundtrip");
} else {
panic!("Hex tag missing or wrong type");
}
Ok(())
}
#[rstest]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads", "true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads", "false"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads=true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--exclude-missing-reads=false"], false)]
fn test_exclude_missing_reads_parsing(#[case] args: &[&str], #[case] expected: bool) {
let cmd = Zipper::try_parse_from(args).expect("failed to parse Zipper arguments");
assert_eq!(cmd.exclude_missing_reads, expected);
}
#[rstest]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags", "true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags", "false"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags=true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--skip-pa-tags=false"], false)]
fn test_skip_tc_tags_parsing(#[case] args: &[&str], #[case] expected: bool) {
let cmd = Zipper::try_parse_from(args).expect("failed to parse Zipper arguments");
assert_eq!(cmd.skip_tc_tags, expected);
}
#[rstest]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--restore-unconverted-bases"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--restore-unconverted-bases", "true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--restore-unconverted-bases", "false"], false)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--restore-unconverted-bases=true"], true)]
#[case(&["zipper", "-u", "u.bam", "-r", "ref.fa", "-o", "out.bam", "--restore-unconverted-bases=false"], false)]
fn test_restore_unconverted_bases_parsing(#[case] args: &[&str], #[case] expected: bool) {
let cmd = Zipper::try_parse_from(args).expect("failed to parse Zipper arguments");
assert_eq!(cmd.restore_unconverted_bases, expected);
}
fn raw_sequence(rec: &RawRecord) -> Vec<u8> {
rec.view().sequence_vec()
}
fn raw_tag_absent(rec: &RawRecord, tag: [u8; 2]) -> bool {
fgumi_raw_bam::find_string_tag_in_record(rec.as_ref(), &tag).is_none()
&& fgumi_raw_bam::find_tag_type(fgumi_raw_bam::aux_data_slice(rec.as_ref()), &tag)
.is_none()
}
#[test]
fn test_raw_restore_unconverted_bases_top_strand() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "ACGTACGTACGT")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:12\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(0)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ATGTATGT")
.qualities(&[30u8; 8]);
b.add_string_tag(b"YD", b"f");
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"ACGTACGT");
Ok(())
}
#[test]
fn test_raw_restore_unconverted_bases_bottom_strand() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "ACGTACGTACGT")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:12\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ2)
.ref_id(0)
.pos(0)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ACATACAT")
.qualities(&[30u8; 8]);
b.add_string_tag(b"YD", b"r");
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"ACGTACGT");
Ok(())
}
#[test]
fn test_raw_restore_unconverted_bases_with_indels() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "ACGTACGTACGT")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:12\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(0)
.mapq(60)
.cigar_ops(&[
encode_op(0, 2),
encode_op(1, 1),
encode_op(0, 2),
encode_op(2, 1),
encode_op(0, 3),
])
.sequence(b"TTNATTGT")
.qualities(&[30u8; 8]);
b.add_string_tag(b"YD", b"f");
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"TCNATCGT");
Ok(())
}
#[test]
fn test_raw_restore_unconverted_bases_skips_unmapped() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "CCCCCCCC")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:8\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1").flags(flags::UNMAPPED).sequence(b"TTTTTTTT").qualities(&[30u8; 8]);
b.add_string_tag(b"YD", b"f");
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"TTTTTTTT");
Ok(())
}
#[test]
fn test_raw_restore_unconverted_bases_skips_no_yd_tag() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "CCCCCCCC")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:8\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(0)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"TTTTTTTT")
.qualities(&[30u8; 8]);
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"TTTTTTTT");
Ok(())
}
#[test]
fn test_raw_restore_unconverted_bases_preserves_already_unconverted() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "CCCCCCCC")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:8\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(0)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"CTCACTGC")
.qualities(&[30u8; 8]);
b.add_string_tag(b"YD", b"f");
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"CCCACCGC");
Ok(())
}
#[test]
fn test_raw_restore_unconverted_bases_no_target_in_span() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "ATGTATGT")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:8\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(0)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ATGTATGT")
.qualities(&[30u8; 8]);
b.add_string_tag(b"YD", b"f");
b.add_int_tag(b"NM", 0);
b.add_string_tag(b"MD", b"8");
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"ATGTATGT");
assert!(!raw_tag_absent(&raw, *b"NM"), "NM should remain when no bases were changed");
assert!(!raw_tag_absent(&raw, *b"MD"), "MD should remain when no bases were changed");
Ok(())
}
#[test]
fn test_raw_restore_unconverted_bases_top_strand_reverse() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "ACGTACGTACGT")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:12\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1 | FLAG_REVERSE)
.ref_id(0)
.pos(0)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ACATACGT")
.qualities(&[30u8; 8]);
b.add_string_tag(b"YD", b"f");
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"ACGTACGT");
Ok(())
}
#[test]
fn test_raw_restore_unconverted_bases_bottom_strand_reverse() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "ACGTACGTACGT")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:12\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ2 | FLAG_REVERSE)
.ref_id(0)
.pos(0)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ATGTACGT")
.qualities(&[30u8; 8]);
b.add_string_tag(b"YD", b"r");
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"ACGTACGT");
Ok(())
}
#[test]
fn test_raw_restore_removes_nm_md_when_bases_changed() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let fasta = create_test_fasta(&[("chr1", "ACGTACGTACGT")])?;
let reference = ReferenceReader::new(fasta.path())?;
let header: Header = "@HD\tVN:1.6\tSO:queryname\n@SQ\tSN:chr1\tLN:12\n".parse().unwrap();
let mut raw = {
let mut b = RawSamBuilder::new();
b.read_name(b"q1")
.flags(FLAG_PAIRED | FLAG_READ1)
.ref_id(0)
.pos(0)
.mapq(60)
.cigar_ops(&[encode_op(0, 8)])
.sequence(b"ATGTATGT")
.qualities(&[30u8; 8]);
b.add_string_tag(b"YD", b"f");
b.add_int_tag(b"NM", 2);
b.add_string_tag(b"MD", b"1T3T2");
b.build()
};
restore_unconverted_bases_in_raw_record(&mut raw, &reference, &header)?;
assert_eq!(raw_sequence(&raw), b"ACGTACGT");
assert!(raw_tag_absent(&raw, *b"NM"), "NM should be removed when bases were changed");
assert!(raw_tag_absent(&raw, *b"MD"), "MD should be removed when bases were changed");
Ok(())
}
}