use std::fs;
use std::path::{Path, PathBuf};
use std::process::Command;

use fgumi_raw_bam::{RawRecord, SamBuilder, flags};
use noodles::bam;
use noodles::sam::alignment::io::Write as AlignmentWrite;
use tempfile::TempDir;

use crate::helpers::bam_generator::{create_minimal_header, to_record_buf};
fn create_sorted_bam(path: &PathBuf, records: Vec<RawRecord>) {
let header = create_minimal_header("chr1", 10000);
let mut writer =
bam::io::Writer::new(fs::File::create(path).expect("Failed to create BAM file"));
writer.write_header(&header).expect("Failed to write header");
for record in records {
writer
.write_alignment_record(&header, &to_record_buf(&record))
.expect("Failed to write record");
}
writer.try_finish().expect("Failed to finish BAM");
}
/// Builds `count` read pairs that all share the same coordinates and UMI
/// (`RX` tag) — i.e. one duplicate group of `2 * count` records.
///
/// For each template `{base_name}_{i}`: R1 maps forward at `start - 1` and R2
/// maps reverse at `start + 99` (presumably 0-based builder coordinates for a
/// 1-based `start` — mirrors the original encoding; confirm against
/// `SamBuilder::pos`). Both mates carry an 8M CIGAR, `MC:8M`, and TLEN ±108.
fn create_duplicate_group(base_name: &str, umi: &str, count: usize, start: i32) -> Vec<RawRecord> {
    let mut records = Vec::with_capacity(count * 2);
    for i in 0..count {
        let name = format!("{base_name}_{i}");
        // The two mates differ only in flags, positions, and TLEN sign, so a
        // single local builder closure replaces the duplicated blocks.
        let build_mate = |mate_flags, pos, mate_pos, tlen| {
            let mut b = SamBuilder::new();
            b.read_name(name.as_bytes())
                .sequence(b"ACGTACGT")
                .qualities(&[30; 8])
                .flags(mate_flags)
                .ref_id(0)
                .pos(pos)
                .mapq(60)
                // BAM CIGAR op encoding is `len << 4 | op`; op 0 is 'M',
                // so `8 << 4` is "8M" (matches the MC tag below).
                .cigar_ops(&[8 << 4])
                .mate_ref_id(0)
                .mate_pos(mate_pos)
                .template_length(tlen)
                .add_string_tag(b"RX", umi.as_bytes())
                .add_string_tag(b"MC", b"8M");
            b.build()
        };
        records.push(build_mate(flags::PAIRED | flags::FIRST_SEGMENT, start - 1, start + 99, 108));
        records.push(build_mate(
            flags::PAIRED | flags::LAST_SEGMENT | flags::REVERSE,
            start + 99,
            start - 1,
            -108,
        ));
    }
    records
}
// Two UMI groups (3 + 2 templates = 10 reads) go in; with the default
// mark-only behavior, all 10 reads must come back out of the dedup command.
#[test]
fn test_dedup_command_basic() {
    let dir = TempDir::new().unwrap();
    let input = dir.path().join("input.bam");
    let output = dir.path().join("output.bam");

    let mut reads = create_duplicate_group("dup1", "ACGTACGT", 3, 100);
    reads.extend(create_duplicate_group("dup2", "TGCATGCA", 2, 500));
    create_sorted_bam(&input, reads);

    let args = [
        "dedup",
        "--input",
        input.to_str().unwrap(),
        "--output",
        output.to_str().unwrap(),
        "--strategy",
        "identity",
        "--compression-level",
        "1",
    ];
    let status = Command::new(env!("CARGO_BIN_EXE_fgumi"))
        .args(args)
        .status()
        .expect("Failed to run dedup command");
    assert!(status.success(), "Dedup command failed");
    assert!(output.exists(), "Output BAM not created");

    // Duplicates are flagged, not dropped, so every input read survives.
    let mut reader = bam::io::Reader::new(fs::File::open(&output).unwrap());
    let _header = reader.read_header().unwrap();
    assert_eq!(
        reader.records().count(),
        10,
        "All reads should be in output (marked, not removed)"
    );
}
// One duplicate group of three templates; this test only checks that the
// `--metrics` file is produced, not its contents.
#[test]
fn test_dedup_command_with_metrics() {
    let dir = TempDir::new().unwrap();
    let input = dir.path().join("input.bam");
    let output = dir.path().join("output.bam");
    let metrics = dir.path().join("metrics.txt");

    create_sorted_bam(&input, create_duplicate_group("dup1", "ACGTACGT", 3, 100));

    let status = Command::new(env!("CARGO_BIN_EXE_fgumi"))
        .args([
            "dedup",
            "--input",
            input.to_str().unwrap(),
            "--output",
            output.to_str().unwrap(),
            "--strategy",
            "identity",
            "--metrics",
            metrics.to_str().unwrap(),
            "--compression-level",
            "1",
        ])
        .status()
        .expect("Failed to run dedup command");

    assert!(status.success(), "Dedup command with metrics failed");
    assert!(metrics.exists(), "Metrics file not created");
}
// Three duplicate pairs (6 reads) in; with `--remove-duplicates` the output
// must shrink but still retain at least one full pair.
#[test]
fn test_dedup_command_remove_duplicates() {
    let dir = TempDir::new().unwrap();
    let input = dir.path().join("input.bam");
    let output = dir.path().join("output.bam");

    create_sorted_bam(&input, create_duplicate_group("dup1", "ACGTACGT", 3, 100));

    let status = Command::new(env!("CARGO_BIN_EXE_fgumi"))
        .args([
            "dedup",
            "--input",
            input.to_str().unwrap(),
            "--output",
            output.to_str().unwrap(),
            "--strategy",
            "identity",
            "--remove-duplicates",
            "--compression-level",
            "1",
        ])
        .status()
        .expect("Failed to run dedup command");
    assert!(status.success(), "Dedup command with --remove-duplicates failed");
    assert!(output.exists(), "Output BAM not created");

    // Bounds rather than an exact count: fewer than the 6 input reads, but at
    // least one surviving pair.
    let mut reader = bam::io::Reader::new(fs::File::open(&output).unwrap());
    let _header = reader.read_header().unwrap();
    let surviving = reader.records().count();
    assert!(surviving < 6, "Remove-duplicates should produce fewer reads than input");
    assert!(surviving >= 2, "Should keep at least one pair");
}
// Regression test for `dedup --no-umi`: NUM_TEMPLATES identical pairs all map
// to the same position, forming one very large position group. The command is
// driven in-process (clap parse + execute) rather than via the binary. The
// expected outcome: exactly one best-scoring pair unmarked, every other pair
// flagged duplicate, and a single shared MI tag value across all records.
#[test]
#[allow(clippy::too_many_lines)]
fn test_dedup_no_umi_large_position_group() {
    use bstr::BString;
    use clap::Parser;
    use fgumi_lib::commands::command::Command as FgumiCommand;
    use fgumi_lib::commands::dedup::MarkDuplicates;
    use fgumi_lib::sam::SamTag;
    use fgumi_raw_bam::raw_record_to_record_buf;
    use fgumi_raw_bam::{SamBuilder as RawSamBuilder, flags, testutil::encode_op};
    use noodles::sam::Header;
    use noodles::sam::alignment::record::data::field::Tag;
    use noodles::sam::alignment::record_buf::RecordBuf;
    use noodles::sam::alignment::record_buf::data::field::value::Value as DataValue;
    use noodles::sam::header::record::value::map::Map as HeaderRecordMap;
    use noodles::sam::header::record::value::map::header::tag::Tag as HeaderTag;
    use noodles::sam::header::record::value::{
        Map, map::Header as HeaderRecord, map::ReferenceSequence,
    };
    use std::collections::HashSet;
    use std::num::NonZeroUsize;

    // Large enough to stress any per-group buffering in the dedup path.
    const NUM_TEMPLATES: usize = 5_000;
    // Raw SAM flag bits not covered by the `flags` constants used below.
    const PROPERLY_PAIRED: u16 = 0x2;
    const MATE_REVERSE: u16 = 0x20;

    let temp_dir = TempDir::new().unwrap();
    let input_bam = temp_dir.path().join("input.bam");
    let output_bam = temp_dir.path().join("output.bam");

    // Build a header whose @HD line carries SO:unsorted, GO:query, and
    // SS:template-coordinate (presumably the sort state the dedup command
    // expects on input — confirm against MarkDuplicates' input validation),
    // plus a single 10 kb "chr1" reference.
    let header = {
        let mut builder = HeaderRecordMap::<HeaderRecord>::builder();
        for (tag_bytes, value) in
            [(*b"SO", "unsorted"), (*b"GO", "query"), (*b"SS", "template-coordinate")]
        {
            // Each two-byte tag maps to HeaderTag::Other; any other variant
            // would be a logic error in this loop's fixed tag list.
            let HeaderTag::Other(tag) = HeaderTag::from(tag_bytes) else { unreachable!() };
            builder = builder.insert(tag, value);
        }
        Header::builder()
            .set_header(builder.build().expect("valid header map"))
            .add_reference_sequence(
                BString::from("chr1"),
                Map::<ReferenceSequence>::new(NonZeroUsize::new(10_000).expect("non-zero length")),
            )
            .build()
    };

    // Write NUM_TEMPLATES pairs that share identical coordinates (R1 at 99,
    // R2 at 199, both 8M) so they all fall into one position group. No RX/UMI
    // tags are written — this exercises the --no-umi path.
    {
        let mut writer = bam::io::Writer::new(fs::File::create(&input_bam).expect("create BAM"));
        writer.write_header(&header).expect("write header");
        for i in 0..NUM_TEMPLATES {
            let name = format!("read_{i:05}");
            let r1 = {
                let mut b = RawSamBuilder::new();
                b.read_name(name.as_bytes())
                    .sequence(b"ACGTACGT")
                    .qualities(&[30; 8])
                    .flags(flags::PAIRED | PROPERLY_PAIRED | flags::FIRST_SEGMENT | MATE_REVERSE)
                    .ref_id(0)
                    .pos(99)
                    .mapq(60)
                    // encode_op(0, 8): CIGAR op 0 ('M'), length 8 -> "8M".
                    .cigar_ops(&[encode_op(0, 8)])
                    .mate_ref_id(0)
                    .mate_pos(199)
                    .template_length(108);
                b.add_string_tag(b"MC", b"8M");
                b.build()
            };
            let r2 = {
                let mut b = RawSamBuilder::new();
                b.read_name(name.as_bytes())
                    .sequence(b"ACGTACGT")
                    .qualities(&[30; 8])
                    .flags(flags::PAIRED | PROPERLY_PAIRED | flags::REVERSE | flags::LAST_SEGMENT)
                    .ref_id(0)
                    .pos(199)
                    .mapq(60)
                    .cigar_ops(&[encode_op(0, 8)])
                    .mate_ref_id(0)
                    .mate_pos(99)
                    .template_length(-108);
                b.add_string_tag(b"MC", b"8M");
                b.build()
            };
            // Raw records must be converted to RecordBufs against the header
            // before they can be written through the noodles writer.
            let r1_buf = raw_record_to_record_buf(&r1, &header).expect("convert R1");
            let r2_buf = raw_record_to_record_buf(&r2, &header).expect("convert R2");
            writer.write_alignment_record(&header, &r1_buf).expect("write R1");
            writer.write_alignment_record(&header, &r2_buf).expect("write R2");
        }
        writer.try_finish().expect("finish BAM");
    }

    // Parse and execute the command in-process (FgumiCommand trait provides
    // `execute`), bypassing the compiled binary.
    let cmd = MarkDuplicates::try_parse_from([
        "dedup",
        "--input",
        input_bam.to_str().unwrap(),
        "--output",
        output_bam.to_str().unwrap(),
        "--no-umi",
        "--compression-level",
        "1",
    ])
    .expect("failed to parse dedup args");
    cmd.execute("test").expect("dedup --no-umi should succeed with large position group");
    assert!(output_bam.exists(), "output BAM should be created");

    // Tally duplicate flags, distinct non-duplicate template names, and MI
    // tag coverage across the whole output.
    let mut reader = bam::io::Reader::new(fs::File::open(&output_bam).expect("open output BAM"));
    let out_header = reader.read_header().expect("read output header");
    let mut total_records = 0usize;
    let mut duplicate_records = 0usize;
    let mut non_duplicate_records = 0usize;
    let mut mi_values = HashSet::new();
    let mut mi_count = 0usize;
    let mut non_dup_names = HashSet::new();
    let mi_tag = Tag::from(SamTag::MI);
    for result in reader.record_bufs(&out_header) {
        let record: RecordBuf = result.expect("read record");
        total_records += 1;
        let is_dup = record.flags().is_duplicate();
        if is_dup {
            duplicate_records += 1;
        } else {
            non_duplicate_records += 1;
            if let Some(name) = record.name() {
                non_dup_names.insert(name.to_owned());
            }
        }
        if let Some(DataValue::String(mi)) = record.data().get(&mi_tag) {
            mi_values.insert(mi.to_owned());
            mi_count += 1;
        }
    }
    assert_eq!(total_records, NUM_TEMPLATES * 2, "all records should be present in output");
    assert_eq!(
        non_duplicate_records, 2,
        "exactly one pair should be non-duplicate (the best-scoring template)"
    );
    assert_eq!(
        duplicate_records,
        (NUM_TEMPLATES - 1) * 2,
        "all other pairs should be marked as duplicates"
    );
    assert_eq!(non_dup_names.len(), 1, "non-duplicate records should be from one template");
    assert_eq!(mi_count, NUM_TEMPLATES * 2, "all records should have an MI tag");
    assert_eq!(mi_values.len(), 1, "all records should share a single MI tag value");
}