use crate::alignment_tags::regenerate_alignment_tags_raw;
use crate::bam_io::{
create_bam_reader_for_pipeline, create_raw_bam_reader, create_raw_bam_writer, is_stdin_path,
};
use crate::clipper::{ClippingMode, RawRecordClipper};
use crate::grouper::TemplateGrouper;
use crate::logging::OperationTimer;
use crate::metrics::clip::{ClipCounts, ClippingMetricsCollection};
use crate::metrics::writer::write_metrics as write_metrics_tsv;
use crate::per_thread_accumulator::PerThreadAccumulator;
use crate::progress::ProgressTracker;
use crate::reference::ReferenceReader;
use crate::template::{TemplateBatch, TemplateIterator};
use crate::unified_pipeline::{
GroupKeyConfig, Grouper, MemoryEstimate, run_bam_pipeline_from_reader,
};
use crate::validation::validate_file_exists;
use anyhow::Result;
use clap::Parser;
use fgumi_raw_bam::RawRecord;
use log::info;
use noodles::sam::Header;
use std::io;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use super::command::Command;
use super::common::{
BamIoOptions, CompressionOptions, QueueMemoryOptions, SchedulerOptions, ThreadingOptions,
build_pipeline_config, parse_bool,
};
#[derive(Parser, Debug)]
#[command(
name = "clip",
about = "\x1b[38;5;173m[POST-CONSENSUS]\x1b[0m \x1b[36mClip overlapping reads in BAM files\x1b[0m",
long_about = r#"
Clips reads from the same template. Ensures that at least N bases are clipped from any end of the read (i.e.
R1 5' end, R1 3' end, R2 5' end, and R2 3' end). Optionally clips reads from the same template to eliminate overlap
between the reads. This ensures that downstream processes, particularly variant calling, cannot double-count
evidence from the same template when both reads span a variant site in the same template.
Clipping overlapping reads is only performed on FR read pairs, and is implemented by clipping approximately half
the overlapping bases from each read. By default soft clipping is performed.
Secondary alignments and supplemental alignments are not clipped, but are passed through into the output.
In order to correctly clip reads by template and update mate information, the input BAM must be either
queryname sorted or query grouped. If your input BAM is not in an appropriate order the sort can be
done in streaming fashion with, for example:
fgumi sort -i in.bam --order queryname | fgumi clip -i /dev/stdin ...
The output sort order may be specified with --sort-order. If not given, then the output will be in the same
order as input.
Any existing NM, UQ and MD tags are repaired, and mate-pair information is updated.
Three clipping modes are supported:
1. `soft` - soft-clip the bases and qualities.
2. `soft-with-mask` - soft-clip and mask the bases and qualities (make bases Ns and qualities the minimum).
3. `hard` - hard-clip the bases and qualities.
The --upgrade-clipping parameter will convert all existing clipping in the input to the given more stringent mode:
from `soft` to either `soft-with-mask` or `hard`, and `soft-with-mask` to `hard`. In all other cases, clipping remains
the same prior to applying any other clipping criteria.
"#
)]
#[allow(clippy::struct_excessive_bools)]
pub struct Clip {
#[command(flatten)]
pub io: BamIoOptions,
#[arg(short = 'r', long = "reference", alias = "ref", required = true)]
pub reference: PathBuf,
#[arg(short = 'c', long = "clipping-mode", default_value_t = ClippingMode::Hard)]
pub clipping_mode: ClippingMode,
#[arg(
short = 'S',
long = "sort-order",
value_parser = ["unknown", "unsorted", "queryname", "coordinate"]
)]
pub sort_order: Option<String>,
#[arg(long = "clip-overlapping-reads", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub clip_overlapping_reads: bool,
#[arg(
long = "clip-bases-past-mate",
alias = "clip-extending-past-mate",
default_value = "false",
num_args = 0..=1,
default_missing_value = "true",
action = clap::ArgAction::Set,
value_parser = parse_bool,
)]
pub clip_extending_past_mate: bool,
#[arg(long = "read-one-five-prime", default_value = "0")]
pub read_one_five_prime: usize,
#[arg(long = "read-one-three-prime", default_value = "0")]
pub read_one_three_prime: usize,
#[arg(long = "read-two-five-prime", default_value = "0")]
pub read_two_five_prime: usize,
#[arg(long = "read-two-three-prime", default_value = "0")]
pub read_two_three_prime: usize,
#[arg(short = 'H', long = "upgrade-clipping", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub upgrade_clipping: bool,
#[arg(short = 'a', long = "auto-clip-attributes", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub auto_clip_attributes: bool,
#[arg(short = 'm', long = "metrics")]
pub metrics: Option<PathBuf>,
#[command(flatten)]
pub threading: ThreadingOptions,
#[command(flatten)]
pub compression: CompressionOptions,
#[command(flatten)]
pub scheduler_opts: SchedulerOptions,
#[command(flatten)]
pub queue_memory: QueueMemoryOptions,
}
struct ClipProcessedBatch {
clipped_records: Vec<RawRecord>,
templates_count: u64,
overlap_clipped_count: u64,
extend_clipped_count: u64,
}
impl MemoryEstimate for ClipProcessedBatch {
fn estimate_heap_size(&self) -> usize {
self.clipped_records.iter().map(MemoryEstimate::estimate_heap_size).sum::<usize>()
+ self.clipped_records.capacity() * std::mem::size_of::<RawRecord>()
}
}
#[derive(Default)]
struct CollectedClipMetrics {
total_templates: u64,
overlap_clipped: u64,
extend_clipped: u64,
}
impl Command for Clip {
#[allow(clippy::too_many_lines)]
fn execute(&self, command_line: &str) -> Result<()> {
if !is_stdin_path(&self.io.input) {
validate_file_exists(&self.io.input, "Input BAM")?;
}
validate_file_exists(&self.reference, "Reference FASTA")?;
info!("Clip");
info!(" Input: {}", self.io.input.display());
info!(" Output: {}", self.io.output.display());
info!(" Clipping mode: {}", self.clipping_mode);
info!(" Clip overlapping reads: {}", self.clip_overlapping_reads);
info!(" Clip extending past mate: {}", self.clip_extending_past_mate);
info!(" {}", self.threading.log_message());
let timer = OperationTimer::new("Clipping reads");
let mode = self.clipping_mode;
if self.upgrade_clipping
|| self.clip_overlapping_reads
|| self.clip_extending_past_mate
|| self.read_one_five_prime > 0
|| self.read_one_three_prime > 0
|| self.read_two_five_prime > 0
|| self.read_two_three_prime > 0
{
} else {
anyhow::bail!("At least one clipping option is required");
}
if self.threading.threads.is_some() {
if let Some(path) = &self.metrics {
anyhow::bail!(
"--metrics {} cannot be used with --threads: detailed clipping metrics \
are only produced by the single-threaded path",
path.display()
);
}
}
let total_records = if let Some(threads) = self.threading.threads {
let (reader, header) = create_bam_reader_for_pipeline(&self.io.input)?;
let reference = Arc::new(ReferenceReader::new(&self.reference)?);
let header = self.update_header_sort_order(header)?;
let header = crate::commands::common::add_pg_record(header, command_line)?;
self.execute_threads_mode(reader, threads, header, reference)?
} else {
self.execute_single_threaded(mode, command_line)?
};
timer.log_completion(total_records);
Ok(())
}
}
impl Clip {
#[allow(clippy::too_many_lines)]
fn execute_single_threaded(&self, mode: ClippingMode, command_line: &str) -> Result<u64> {
let clipper = if self.auto_clip_attributes {
RawRecordClipper::with_auto_clip(mode, true)
} else {
RawRecordClipper::new(mode)
};
let mut metrics_collection =
self.metrics.as_ref().map(|_| ClippingMetricsCollection::new());
let reference_reader = ReferenceReader::new(&self.reference)?;
let reader_threads = self.threading.num_threads();
let (reader, header) = create_raw_bam_reader(&self.io.input, reader_threads)?;
let header = self.update_header_sort_order(header)?;
let header = crate::commands::common::add_pg_record(header, command_line)?;
let writer_threads = self.threading.num_threads();
let mut writer = create_raw_bam_writer(
&self.io.output,
&header,
writer_threads,
self.compression.compression_level,
)?;
info!("Processing templates...");
let mut total_records: usize = 0;
let mut total_clipped_overlap: u64 = 0;
let mut total_clipped_mate_extension: u64 = 0;
let progress = ProgressTracker::new("Processed records").with_interval(1_000_000);
let template_iter = TemplateIterator::new(reader);
for template in template_iter {
let template = template?;
let mut records: Vec<RawRecord> = template.into_records();
#[allow(clippy::len_zero)] if records.len() == 1 {
let record = &mut records[0];
self.clip_fragment(&clipper, record, metrics_collection.as_mut())?;
} else if records.len() == 2 {
let (r1, r2) = records.split_at_mut(1);
let r1 = &mut r1[0];
let r2 = &mut r2[0];
let (overlap_clip, extend_clip) =
self.clip_pair(&clipper, r1, r2, metrics_collection.as_mut())?;
if overlap_clip {
total_clipped_overlap += 1;
}
if extend_clip {
total_clipped_mate_extension += 1;
}
update_mate_info_raw(r1, r2);
update_mate_info_raw(r2, r1);
}
for record in &mut records {
regenerate_alignment_tags_raw(record.as_mut_vec(), &header, &reference_reader)?;
}
let batch_size = records.len();
total_records += batch_size;
for record in &records {
writer.write_raw_record(record.as_ref())?;
}
progress.log_if_needed(batch_size as u64);
}
progress.log_final();
info!("Total records processed: {total_records}");
info!("Templates with overlap clipping: {total_clipped_overlap}");
info!("Templates with mate extension clipping: {total_clipped_mate_extension}");
if let Some(metrics_path) = &self.metrics {
if let Some(mut metrics) = metrics_collection {
info!("Writing metrics to {}", metrics_path.display());
metrics.finalize();
let all_metrics = metrics.all_metrics();
write_metrics_tsv(metrics_path, &all_metrics, "clipping")?;
info!("Metrics written successfully");
}
}
writer.finish()?;
info!("Done!");
Ok(total_records as u64)
}
fn clip_fragment(
&self,
clipper: &RawRecordClipper,
record: &mut RawRecord,
metrics: Option<&mut ClippingMetricsCollection>,
) -> Result<()> {
let prior_bases_clipped = clipped_bases_raw(record);
if self.upgrade_clipping {
clipper.upgrade_all_clipping_raw(record)?;
}
let num_five_prime = if self.read_one_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(record, self.read_one_five_prime)
} else {
0
};
let num_three_prime = if self.read_one_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(record, self.read_one_three_prime)
} else {
0
};
if let Some(metrics) = metrics {
metrics.fragment.update_raw(
record,
ClipCounts {
prior: prior_bases_clipped,
five_prime: num_five_prime,
three_prime: num_three_prime,
..ClipCounts::default()
},
);
}
Ok(())
}
fn clip_pair(
&self,
clipper: &RawRecordClipper,
r1: &mut RawRecord,
r2: &mut RawRecord,
metrics: Option<&mut ClippingMetricsCollection>,
) -> Result<(bool, bool)> {
let prior_bases_clipped_r1 = clipped_bases_raw(r1);
let prior_bases_clipped_r2 = clipped_bases_raw(r2);
if self.upgrade_clipping {
clipper.upgrade_all_clipping_raw(r1)?;
clipper.upgrade_all_clipping_raw(r2)?;
}
let (is_r1_first, is_r2_last) = (r1.is_first_segment(), r2.is_last_segment());
let num_r1_five_prime = if is_r1_first && self.read_one_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(r1, self.read_one_five_prime)
} else if !is_r1_first && self.read_two_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(r1, self.read_two_five_prime)
} else {
0
};
let num_r1_three_prime = if is_r1_first && self.read_one_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(r1, self.read_one_three_prime)
} else if !is_r1_first && self.read_two_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(r1, self.read_two_three_prime)
} else {
0
};
let num_r2_five_prime = if is_r2_last && self.read_two_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(r2, self.read_two_five_prime)
} else if !is_r2_last && self.read_one_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(r2, self.read_one_five_prime)
} else {
0
};
let num_r2_three_prime = if is_r2_last && self.read_two_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(r2, self.read_two_three_prime)
} else if !is_r2_last && self.read_one_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(r2, self.read_one_three_prime)
} else {
0
};
let (num_overlapping_r1, num_overlapping_r2) = if self.clip_overlapping_reads {
clipper.clip_overlapping_reads(r1, r2)
} else {
(0, 0)
};
let (num_extending_r1, num_extending_r2) = if self.clip_extending_past_mate {
clipper.clip_extending_past_mate_ends(r1, r2)
} else {
(0, 0)
};
if let Some(metrics) = metrics {
let r1_counts = ClipCounts {
prior: prior_bases_clipped_r1,
five_prime: num_r1_five_prime,
three_prime: num_r1_three_prime,
overlapping: num_overlapping_r1,
extending: num_extending_r1,
};
let r2_counts = ClipCounts {
prior: prior_bases_clipped_r2,
five_prime: num_r2_five_prime,
three_prime: num_r2_three_prime,
overlapping: num_overlapping_r2,
extending: num_extending_r2,
};
if is_r1_first {
metrics.read_one.update_raw(r1, r1_counts);
} else {
metrics.read_two.update_raw(r1, r1_counts);
}
if is_r2_last {
metrics.read_two.update_raw(r2, r2_counts);
} else {
metrics.read_one.update_raw(r2, r2_counts);
}
}
let overlap_clipped = num_overlapping_r1 > 0 || num_overlapping_r2 > 0;
let extend_clipped = num_extending_r1 > 0 || num_extending_r2 > 0;
Ok((overlap_clipped, extend_clipped))
}
fn update_header_sort_order(&self, header: Header) -> Result<Header> {
if let Some(ref sort_order) = self.sort_order {
use bstr::BString;
use noodles::sam::header::record::value::Map;
use noodles::sam::header::record::value::map::header::tag::SORT_ORDER;
let mut header_map = if let Some(hd) = header.header() {
hd.clone()
} else {
Map::<noodles::sam::header::record::value::map::Header>::default()
};
*header_map.other_fields_mut().entry(SORT_ORDER).or_insert(BString::from("")) =
BString::from(sort_order.as_str());
let mut builder = noodles::sam::Header::builder();
for (name, rg) in header.read_groups() {
builder = builder.add_read_group(name.clone(), rg.clone());
}
for (name, reference) in header.reference_sequences() {
builder = builder.add_reference_sequence(name.clone(), reference.clone());
}
for (id, pg) in header.programs().as_ref() {
builder = builder.add_program(id.clone(), pg.clone());
}
for comment in header.comments() {
builder = builder.add_comment(comment.clone());
}
builder = builder.set_header(header_map);
Ok(builder.build())
} else {
Ok(header)
}
}
#[allow(clippy::too_many_lines)]
fn execute_threads_mode(
&self,
reader: Box<dyn std::io::Read + Send>,
num_threads: usize,
header: Header,
reference: Arc<ReferenceReader>,
) -> Result<u64> {
let mut pipeline_config = build_pipeline_config(
&self.scheduler_opts,
&self.compression,
&self.queue_memory,
num_threads,
)?;
pipeline_config.group_key_config =
Some(GroupKeyConfig::new_raw_no_cell(crate::read_info::LibraryIndex::default()));
let collected_metrics = PerThreadAccumulator::<CollectedClipMetrics>::new(num_threads);
let collected_for_serialize = Arc::clone(&collected_metrics);
const BATCH_SIZE: usize = 1000;
let clipping_mode = self.clipping_mode;
let auto_clip_attributes = self.auto_clip_attributes;
let upgrade_clipping = self.upgrade_clipping;
let clip_overlapping_reads = self.clip_overlapping_reads;
let clip_extending_past_mate = self.clip_extending_past_mate;
let read_one_five_prime = self.read_one_five_prime;
let read_one_three_prime = self.read_one_three_prime;
let read_two_five_prime = self.read_two_five_prime;
let read_two_three_prime = self.read_two_three_prime;
let reference_for_process = Arc::clone(&reference);
let header_for_process = header.clone();
let progress_counter = Arc::new(AtomicU64::new(0));
let progress_for_process = Arc::clone(&progress_counter);
let grouper_fn = move |_header: &Header| {
Box::new(TemplateGrouper::new(BATCH_SIZE))
as Box<dyn Grouper<Group = TemplateBatch> + Send>
};
let process_fn = move |batch: TemplateBatch| -> io::Result<ClipProcessedBatch> {
let clipper = if auto_clip_attributes {
RawRecordClipper::with_auto_clip(clipping_mode, true)
} else {
RawRecordClipper::new(clipping_mode)
};
let mut clipped_records = Vec::new();
let mut templates_count = 0u64;
let mut overlap_clipped_count = 0u64;
let mut extend_clipped_count = 0u64;
for template in batch {
templates_count += 1;
let mut records: Vec<RawRecord> = template.into_records();
#[allow(clippy::len_zero)]
if records.len() == 1 {
let record = &mut records[0];
if upgrade_clipping {
clipper.upgrade_all_clipping_raw(record).map_err(io::Error::other)?;
}
if read_one_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(record, read_one_five_prime);
}
if read_one_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(record, read_one_three_prime);
}
} else if records.len() == 2 {
let (r1_slice, r2_slice) = records.split_at_mut(1);
let r1 = &mut r1_slice[0];
let r2 = &mut r2_slice[0];
if upgrade_clipping {
clipper.upgrade_all_clipping_raw(r1).map_err(io::Error::other)?;
clipper.upgrade_all_clipping_raw(r2).map_err(io::Error::other)?;
}
let is_r1_first = r1.is_first_segment();
let is_r2_last = r2.is_last_segment();
if is_r1_first && read_one_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(r1, read_one_five_prime);
} else if !is_r1_first && read_two_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(r1, read_two_five_prime);
}
if is_r1_first && read_one_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(r1, read_one_three_prime);
} else if !is_r1_first && read_two_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(r1, read_two_three_prime);
}
if is_r2_last && read_two_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(r2, read_two_five_prime);
} else if !is_r2_last && read_one_five_prime > 0 {
clipper.clip_5_prime_end_of_alignment(r2, read_one_five_prime);
}
if is_r2_last && read_two_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(r2, read_two_three_prime);
} else if !is_r2_last && read_one_three_prime > 0 {
clipper.clip_3_prime_end_of_alignment(r2, read_one_three_prime);
}
if clip_overlapping_reads {
let (num_r1, num_r2) = clipper.clip_overlapping_reads(r1, r2);
if num_r1 > 0 || num_r2 > 0 {
overlap_clipped_count += 1;
}
}
if clip_extending_past_mate {
let (num_r1, num_r2) = clipper.clip_extending_past_mate_ends(r1, r2);
if num_r1 > 0 || num_r2 > 0 {
extend_clipped_count += 1;
}
}
update_mate_info_raw(r1, r2);
update_mate_info_raw(r2, r1);
}
for record in &mut records {
regenerate_alignment_tags_raw(
record.as_mut_vec(),
&header_for_process,
&reference_for_process,
)
.map_err(io::Error::other)?;
}
clipped_records.extend(records);
}
let records_count = clipped_records.len() as u64;
let count = progress_for_process.fetch_add(records_count, Ordering::Relaxed);
if (count + records_count) / 1_000_000 > count / 1_000_000 {
info!("Processed {} records", count + records_count);
}
Ok(ClipProcessedBatch {
clipped_records,
templates_count,
overlap_clipped_count,
extend_clipped_count,
})
};
let serialize_fn = move |processed: ClipProcessedBatch,
_header: &Header,
output: &mut Vec<u8>|
-> io::Result<u64> {
collected_for_serialize.with_slot(|m| {
m.total_templates += processed.templates_count;
m.overlap_clipped += processed.overlap_clipped_count;
m.extend_clipped += processed.extend_clipped_count;
});
let count = processed.clipped_records.len() as u64;
fgumi_raw_bam::write_raw_records(output, &processed.clipped_records)?;
Ok(count)
};
let records_written = run_bam_pipeline_from_reader(
pipeline_config,
reader,
header.clone(),
&self.io.output,
None, grouper_fn,
process_fn,
serialize_fn,
)?;
let mut total_templates = 0u64;
let mut total_overlap_clipped = 0u64;
let mut total_extend_clipped = 0u64;
for slot in collected_metrics.slots() {
let m = slot.lock();
total_templates += m.total_templates;
total_overlap_clipped += m.overlap_clipped;
total_extend_clipped += m.extend_clipped;
}
info!("Total templates processed: {total_templates}");
info!("Templates with overlap clipping: {total_overlap_clipped}");
info!("Templates with mate extension clipping: {total_extend_clipped}");
info!("Done!");
Ok(records_written)
}
}
fn update_mate_info_raw(record: &mut RawRecord, mate: &RawRecord) {
let cigar_str = mate.cigar_to_string();
let mut editor = record.tags_editor();
editor.update_string(b"MC", cigar_str.as_bytes());
let mapq = mate.mapq();
editor.update_int(b"MQ", i32::from(mapq));
}
fn clipped_bases_raw(record: &RawRecord) -> usize {
record
.cigar_ops_typed()
.filter(|op| {
matches!(
op.kind(),
fgumi_raw_bam::CigarKind::SoftClip | fgumi_raw_bam::CigarKind::HardClip
)
})
.map(|op| op.len() as usize)
.sum()
}
#[cfg(test)]
mod tests {
use super::*;
use rstest::rstest;
use std::path::PathBuf;
#[test]
fn test_default_clip_parameters() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.clipping_mode, ClippingMode::Hard);
assert!(!clip.clip_overlapping_reads);
assert!(!clip.clip_extending_past_mate);
}
#[test]
fn test_clip_with_fixed_positions() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 5,
read_one_three_prime: 3,
read_two_five_prime: 7,
read_two_three_prime: 2,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.read_one_five_prime, 5);
assert_eq!(clip.read_one_three_prime, 3);
assert_eq!(clip.read_two_five_prime, 7);
assert_eq!(clip.read_two_three_prime, 2);
}
#[test]
fn test_clip_with_overlapping_enabled() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: true,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.clipping_mode, ClippingMode::Hard);
assert!(clip.clip_overlapping_reads);
assert!(clip.clip_extending_past_mate);
}
#[test]
fn test_clip_with_metrics_output() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::SoftWithMask,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: true,
auto_clip_attributes: false,
metrics: Some(PathBuf::from("metrics.txt")),
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.clipping_mode, ClippingMode::SoftWithMask);
assert!(clip.upgrade_clipping);
assert_eq!(clip.metrics, Some(PathBuf::from("metrics.txt")));
}
#[test]
fn test_clip_with_tag_regeneration() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: true,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.reference, PathBuf::from("reference.fa"));
assert!(clip.auto_clip_attributes);
}
#[test]
fn test_clip_all_modes_enabled() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: true,
read_one_five_prime: 5,
read_one_three_prime: 5,
read_two_five_prime: 5,
read_two_three_prime: 5,
upgrade_clipping: true,
auto_clip_attributes: true,
metrics: Some(PathBuf::from("metrics.txt")),
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert!(clip.clip_overlapping_reads);
assert!(clip.clip_extending_past_mate);
assert!(clip.upgrade_clipping);
assert!(clip.auto_clip_attributes);
assert!(clip.read_one_five_prime > 0);
}
#[test]
fn test_clipping_mode_enum_values() {
let soft = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Soft,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(soft.clipping_mode, ClippingMode::Soft);
}
#[test]
fn test_clip_with_sort_order_specification() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: Some("coordinate".to_string()),
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.sort_order, Some("coordinate".to_string()));
}
#[test]
fn test_clip_asymmetric_fixed_positions() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Soft,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 10,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 15,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.read_one_five_prime, 10);
assert_eq!(clip.read_one_three_prime, 0);
assert_eq!(clip.read_two_five_prime, 0);
assert_eq!(clip.read_two_three_prime, 15);
}
#[test]
fn test_clip_with_upgrade_all_clipping() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: true,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert!(clip.upgrade_clipping);
assert_eq!(clip.clipping_mode, ClippingMode::Hard);
}
#[test]
fn test_clip_extending_past_mate_only() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: true,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert!(!clip.clip_overlapping_reads);
assert!(clip.clip_extending_past_mate);
}
#[test]
fn test_clip_overlapping_reads_only() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert!(clip.clip_overlapping_reads);
assert!(!clip.clip_extending_past_mate);
}
#[test]
fn test_clip_modes_with_auto_clip_attributes() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: true,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: true,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert!(clip.auto_clip_attributes);
assert_eq!(clip.clipping_mode, ClippingMode::Hard);
}
#[test]
fn test_clip_zero_bases_all_positions() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.read_one_five_prime, 0);
assert_eq!(clip.read_one_three_prime, 0);
assert_eq!(clip.read_two_five_prime, 0);
assert_eq!(clip.read_two_three_prime, 0);
}
#[test]
fn test_clip_soft_with_mask_mode() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::SoftWithMask,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 5,
read_one_three_prime: 5,
read_two_five_prime: 5,
read_two_three_prime: 5,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.clipping_mode, ClippingMode::SoftWithMask);
assert!(clip.clip_overlapping_reads);
assert_eq!(clip.read_one_five_prime, 5);
}
#[test]
fn test_clip_large_fixed_positions() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 50,
read_one_three_prime: 50,
read_two_five_prime: 50,
read_two_three_prime: 50,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.read_one_five_prime, 50);
assert_eq!(clip.read_one_three_prime, 50);
assert_eq!(clip.read_two_five_prime, 50);
assert_eq!(clip.read_two_three_prime, 50);
}
#[test]
fn test_clip_combination_overlapping_and_fixed() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 10,
read_one_three_prime: 10,
read_two_five_prime: 10,
read_two_three_prime: 10,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert!(clip.clip_overlapping_reads);
assert!(clip.read_one_five_prime > 0);
assert!(clip.read_one_three_prime > 0);
}
#[test]
fn test_clip_all_three_modes_comparison() {
let soft = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Soft,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
let soft_mask = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::SoftWithMask,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
let hard = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(soft.clipping_mode, ClippingMode::Soft);
assert_eq!(soft_mask.clipping_mode, ClippingMode::SoftWithMask);
assert_eq!(hard.clipping_mode, ClippingMode::Hard);
}
#[test]
fn test_clip_with_queryname_sort_order() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: true,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: Some("queryname".to_string()),
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.sort_order, Some("queryname".to_string()));
}
#[test]
fn test_clip_with_unsorted_sort_order() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: true,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: Some("unsorted".to_string()),
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.sort_order, Some("unsorted".to_string()));
}
#[test]
fn test_clip_single_read_end_clipping() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 20,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert_eq!(clip.read_one_five_prime, 0);
assert_eq!(clip.read_one_three_prime, 0);
assert_eq!(clip.read_two_five_prime, 0);
assert_eq!(clip.read_two_three_prime, 20);
}
#[test]
fn test_clip_with_metrics_and_upgrade() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: true,
auto_clip_attributes: false,
metrics: Some(PathBuf::from("metrics.txt")),
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert!(clip.clip_overlapping_reads);
assert!(clip.upgrade_clipping);
assert!(clip.metrics.is_some());
}
#[test]
fn test_clip_both_extending_and_overlapping() {
let clip = Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Soft,
clip_overlapping_reads: true,
clip_extending_past_mate: true,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
assert!(clip.clip_overlapping_reads);
assert!(clip.clip_extending_past_mate);
}
use crate::sam::SamBuilder;
use anyhow::Result;
use tempfile::TempDir;
fn create_test_reference(dir: &TempDir) -> PathBuf {
let ref_path = dir.path().join("ref.fa");
let ref_content = ">chr1\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\n";
std::fs::write(&ref_path, ref_content).expect("failed to write reference FASTA");
let fai_content = "chr1\t200\t6\t200\t201\n";
std::fs::write(dir.path().join("ref.fa.fai"), fai_content)
.expect("failed to write FASTA index");
ref_path
}
fn read_bam_records(path: &std::path::Path) -> Result<Vec<noodles::sam::alignment::RecordBuf>> {
let mut reader = noodles::bam::io::reader::Builder.build_from_path(path)?;
let header = reader.read_header()?;
let records: Vec<_> = reader.record_bufs(&header).collect::<std::io::Result<Vec<_>>>()?;
Ok(records)
}
#[test]
fn test_clip_execute_basic_overlapping() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT") .contig(0)
.start1(10) .start2(20) .build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
let output_records = read_bam_records(&output_path)?;
assert_eq!(output_records.len(), 2);
Ok(())
}
#[test]
fn test_clip_execute_soft_mode() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(20)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Soft,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
assert!(output_path.exists());
Ok(())
}
#[test]
fn test_clip_execute_soft_with_mask_mode() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(20)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::SoftWithMask,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
assert!(output_path.exists());
Ok(())
}
#[test]
fn test_clip_execute_with_fixed_positions() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(30)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 3,
read_one_three_prime: 2,
read_two_five_prime: 2,
read_two_three_prime: 3,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
assert!(output_path.exists());
Ok(())
}
#[test]
fn test_clip_execute_with_extending_past_mate() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(20)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: true,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
assert!(output_path.exists());
Ok(())
}
#[test]
fn test_clip_execute_with_upgrade_clipping() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(30)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: true,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
assert!(output_path.exists());
Ok(())
}
#[test]
fn test_clip_execute_with_metrics() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let metrics_path = dir.path().join("metrics.txt");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(20)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: Some(metrics_path.clone()),
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
assert!(output_path.exists());
assert!(metrics_path.exists());
Ok(())
}
#[test]
fn test_clip_execute_with_sort_order() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(30)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: Some("queryname".to_string()),
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
assert!(output_path.exists());
Ok(())
}
#[test]
fn test_clip_execute_with_fragment() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_frag()
.name("frag1")
.bases("ACGTACGTACGTACGTACGT")
.contig(0)
.start(10)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 3,
read_one_three_prime: 2,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
let output_records = read_bam_records(&output_path)?;
assert_eq!(output_records.len(), 1);
Ok(())
}
#[test]
fn test_clip_execute_with_auto_clip_attributes() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(30)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: true,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
assert!(output_path.exists());
Ok(())
}
#[test]
fn test_clip_execute_all_clipping_options() -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let metrics_path = dir.path().join("metrics.txt");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(20)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: true,
read_one_five_prime: 2,
read_one_three_prime: 2,
read_two_five_prime: 2,
read_two_three_prime: 2,
upgrade_clipping: true,
auto_clip_attributes: true,
metrics: Some(metrics_path.clone()),
sort_order: Some("queryname".to_string()),
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
assert!(output_path.exists());
assert!(metrics_path.exists());
Ok(())
}
#[test]
fn test_clip_execute_no_clipping_option() {
let dir = TempDir::new().expect("failed to create temp dir");
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(30)
.build();
builder.write(&input_path).expect("failed to write test BAM");
let clip = Clip {
io: BamIoOptions { input: input_path, output: output_path, async_reader: false },
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false, auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
let result = clip.execute("test");
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("At least one clipping option"));
}
#[rstest]
#[case::fast_path(ThreadingOptions::none())]
#[case::pipeline_1(ThreadingOptions::new(1))]
#[case::pipeline_2(ThreadingOptions::new(2))]
fn test_threading_modes(#[case] threading: ThreadingOptions) -> Result<()> {
let dir = TempDir::new()?;
let ref_path = create_test_reference(&dir);
let input_path = dir.path().join("input.bam");
let output_path = dir.path().join("output.bam");
let mut builder = SamBuilder::with_single_ref("chr1", 200);
let _ = builder
.add_pair()
.name("read1")
.bases1("ACGTACGTACGTACGTACGT")
.contig(0)
.start1(10)
.start2(20)
.build();
builder.write(&input_path)?;
let clip = Clip {
io: BamIoOptions {
input: input_path,
output: output_path.clone(),
async_reader: false,
},
reference: ref_path,
clipping_mode: ClippingMode::Hard,
clip_overlapping_reads: true,
clip_extending_past_mate: false,
read_one_five_prime: 0,
read_one_three_prime: 0,
read_two_five_prime: 0,
read_two_three_prime: 0,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading,
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
};
clip.execute("test")?;
let output_records = read_bam_records(&output_path)?;
assert_eq!(output_records.len(), 2, "Should have 2 records");
Ok(())
}
#[test]
fn test_clip_processed_batch_memory_estimate() {
let record =
fgumi_raw_bam::SamBuilder::new().sequence(b"ACGT").qualities(&[30, 30, 30, 30]).build();
let mut records = Vec::with_capacity(10);
records.push(record);
let batch = ClipProcessedBatch {
clipped_records: records,
templates_count: 1,
overlap_clipped_count: 0,
extend_clipped_count: 0,
};
let estimate = batch.estimate_heap_size();
let vec_overhead = 10 * std::mem::size_of::<RawRecord>();
assert!(
estimate >= vec_overhead,
"estimate {estimate} should include Vec<RawRecord> overhead {vec_overhead}"
);
}
fn make_clip(
read_one_five_prime: usize,
read_one_three_prime: usize,
read_two_five_prime: usize,
read_two_three_prime: usize,
) -> Clip {
Clip {
io: BamIoOptions {
input: PathBuf::from("input.bam"),
output: PathBuf::from("output.bam"),
async_reader: false,
},
reference: PathBuf::from("reference.fa"),
clipping_mode: ClippingMode::Soft,
clip_overlapping_reads: false,
clip_extending_past_mate: false,
read_one_five_prime,
read_one_three_prime,
read_two_five_prime,
read_two_three_prime,
upgrade_clipping: false,
auto_clip_attributes: false,
metrics: None,
sort_order: None,
threading: ThreadingOptions::none(),
compression: CompressionOptions { compression_level: 1 },
scheduler_opts: SchedulerOptions::default(),
queue_memory: QueueMemoryOptions::default(),
}
}
#[test]
fn test_clip_fragment_with_metrics() {
use crate::metrics::clip::ClippingMetricsCollection;
let clip = make_clip(3, 2, 0, 0);
let clipper = RawRecordClipper::new(ClippingMode::Soft);
let mut metrics = ClippingMetricsCollection::new();
let mut record = fgumi_raw_bam::SamBuilder::new()
.sequence(b"ACGTACGTACGTACGTACGT")
.qualities(&[30; 20])
.cigar_ops(&[20u32 << 4]) .ref_id(0)
.pos(99) .mapq(60)
.build();
clip.clip_fragment(&clipper, &mut record, Some(&mut metrics))
.expect("clip_fragment should succeed");
assert_eq!(metrics.fragment.reads, 1);
assert_eq!(metrics.fragment.bases_clipped_five_prime, 3);
assert_eq!(metrics.fragment.bases_clipped_three_prime, 2);
assert_eq!(metrics.fragment.bases, 15);
}
#[test]
fn test_clip_fragment_no_clipping_with_metrics() {
use crate::metrics::clip::ClippingMetricsCollection;
let clip = make_clip(0, 0, 0, 0);
let clipper = RawRecordClipper::new(ClippingMode::Soft);
let mut metrics = ClippingMetricsCollection::new();
let mut record = fgumi_raw_bam::SamBuilder::new()
.sequence(b"ACGTACGTACGT")
.qualities(&[30; 12])
.cigar_ops(&[12u32 << 4]) .ref_id(0)
.pos(99) .mapq(60)
.build();
clip.clip_fragment(&clipper, &mut record, Some(&mut metrics))
.expect("clip_fragment should succeed");
assert_eq!(metrics.fragment.reads, 1);
assert_eq!(metrics.fragment.bases, 12);
assert_eq!(metrics.fragment.bases_clipped_five_prime, 0);
assert_eq!(metrics.fragment.bases_clipped_three_prime, 0);
}
#[test]
fn test_clip_pair_with_metrics() {
use crate::metrics::clip::ClippingMetricsCollection;
use fgumi_raw_bam::flags as raw_flags;
let clip = make_clip(2, 1, 1, 2);
let clipper = RawRecordClipper::new(ClippingMode::Soft);
let mut metrics = ClippingMetricsCollection::new();
let mut r1 = fgumi_raw_bam::SamBuilder::new()
.read_name(b"pair1")
.sequence(b"ACGTACGTACGTACGTACGT")
.qualities(&[30; 20])
.cigar_ops(&[20u32 << 4]) .flags(raw_flags::PAIRED | raw_flags::FIRST_SEGMENT)
.ref_id(0)
.pos(99) .mapq(60)
.mate_ref_id(0)
.mate_pos(199) .template_length(120)
.build();
let mut r2 = fgumi_raw_bam::SamBuilder::new()
.read_name(b"pair1")
.sequence(b"ACGTACGTACGTACGTACGT")
.qualities(&[30; 20])
.cigar_ops(&[20u32 << 4]) .flags(raw_flags::PAIRED | raw_flags::LAST_SEGMENT | raw_flags::REVERSE)
.ref_id(0)
.pos(199) .mapq(60)
.mate_ref_id(0)
.mate_pos(99) .template_length(-120)
.build();
let (overlap, extend) = clip
.clip_pair(&clipper, &mut r1, &mut r2, Some(&mut metrics))
.expect("clip_pair should succeed");
assert!(!overlap, "no overlapping clipping expected");
assert!(!extend, "no extending clipping expected");
assert_eq!(metrics.read_one.reads, 1);
assert_eq!(metrics.read_one.bases_clipped_five_prime, 2);
assert_eq!(metrics.read_one.bases_clipped_three_prime, 1);
assert_eq!(metrics.read_two.reads, 1);
assert_eq!(metrics.read_two.bases_clipped_five_prime, 1);
assert_eq!(metrics.read_two.bases_clipped_three_prime, 2);
}
#[test]
fn test_clip_pair_with_metrics_swapped_flags() {
use crate::metrics::clip::ClippingMetricsCollection;
use fgumi_raw_bam::flags as raw_flags;
let clip = make_clip(2, 1, 1, 2);
let clipper = RawRecordClipper::new(ClippingMode::Soft);
let mut metrics = ClippingMetricsCollection::new();
let mut r1 = fgumi_raw_bam::SamBuilder::new()
.read_name(b"pair1")
.sequence(b"ACGTACGTACGTACGTACGT")
.qualities(&[30; 20])
.cigar_ops(&[20u32 << 4]) .flags(raw_flags::PAIRED | raw_flags::LAST_SEGMENT)
.ref_id(0)
.pos(99) .mapq(60)
.mate_ref_id(0)
.mate_pos(199) .template_length(120)
.build();
let mut r2 = fgumi_raw_bam::SamBuilder::new()
.read_name(b"pair1")
.sequence(b"ACGTACGTACGTACGTACGT")
.qualities(&[30; 20])
.cigar_ops(&[20u32 << 4]) .flags(raw_flags::PAIRED | raw_flags::FIRST_SEGMENT | raw_flags::REVERSE)
.ref_id(0)
.pos(199) .mapq(60)
.mate_ref_id(0)
.mate_pos(99) .template_length(-120)
.build();
let (overlap, extend) = clip
.clip_pair(&clipper, &mut r1, &mut r2, Some(&mut metrics))
.expect("clip_pair should succeed");
assert!(!overlap);
assert!(!extend);
assert_eq!(metrics.read_two.reads, 1);
assert_eq!(metrics.read_two.bases_clipped_five_prime, 1);
assert_eq!(metrics.read_two.bases_clipped_three_prime, 2);
assert_eq!(metrics.read_one.reads, 1);
assert_eq!(metrics.read_one.bases_clipped_five_prime, 2);
assert_eq!(metrics.read_one.bases_clipped_three_prime, 1);
}
}