use clap::Parser;
use fgumi_bam_io::create_raw_bam_writer;
use fgumi_dna::reverse_complement;
use fgumi_lib::commands::codec::Codec;
use fgumi_lib::commands::command::Command;
use fgumi_lib::sam::SamTag;
use fgumi_raw_bam::{RawRecord, SamBuilder, flags as raw_flags};
use noodles::bam;
use std::fs;
use std::path::PathBuf;
use tempfile::TempDir;
use crate::helpers::bam_generator::create_minimal_header;
#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::too_many_arguments)]
pub(crate) fn create_codec_read_pair(
name: &str,
r1_seq: &[u8],
r2_seq: &[u8],
r1_qual: &[u8],
r2_qual: &[u8],
ref_start: usize,
umi: &str,
cell_barcode: Option<&str>,
) -> (RawRecord, RawRecord) {
let r1_len = r1_seq.len();
let r2_len = r2_seq.len();
let r1_cigar_op = u32::try_from(r1_len).expect("r1_len fits u32") << 4; let r2_cigar_op = u32::try_from(r2_len).expect("r2_len fits u32") << 4; let pos = i32::try_from(ref_start).expect("ref_start fits i32") - 1;
let r1_mc_tag = format!("{r1_len}M");
let r2_mc_tag = format!("{r2_len}M");
let mut b1 = SamBuilder::new();
b1.read_name(name.as_bytes())
.sequence(r1_seq)
.qualities(r1_qual)
.cigar_ops(&[r1_cigar_op])
.flags(raw_flags::PAIRED | raw_flags::FIRST_SEGMENT | raw_flags::MATE_REVERSE)
.ref_id(0)
.pos(pos)
.mapq(60)
.mate_ref_id(0)
.mate_pos(pos)
.template_length(r1_len as i32)
.add_string_tag(SamTag::MI, umi.as_bytes())
.add_string_tag(SamTag::MC, r2_mc_tag.as_bytes());
if let Some(cb) = cell_barcode {
b1.add_string_tag(SamTag::CB, cb.as_bytes());
}
let mut b2 = SamBuilder::new();
b2.read_name(name.as_bytes())
.sequence(r2_seq)
.qualities(r2_qual)
.cigar_ops(&[r2_cigar_op])
.flags(raw_flags::PAIRED | raw_flags::LAST_SEGMENT | raw_flags::REVERSE)
.ref_id(0)
.pos(pos)
.mapq(60)
.mate_ref_id(0)
.mate_pos(pos)
.template_length(-(r2_len as i32))
.add_string_tag(SamTag::MI, umi.as_bytes())
.add_string_tag(SamTag::MC, r1_mc_tag.as_bytes());
if let Some(cb) = cell_barcode {
b2.add_string_tag(SamTag::CB, cb.as_bytes());
}
(b1.build(), b2.build())
}
pub(crate) fn create_codec_test_bam(path: &PathBuf, pairs: Vec<(RawRecord, RawRecord)>) {
let header = create_minimal_header("chr1", 10000);
let mut writer =
create_raw_bam_writer(path, &header, 1, 6).expect("Failed to create raw BAM writer");
for (r1, r2) in pairs {
writer.write_raw_record(r1.as_ref()).expect("Failed to write R1");
writer.write_raw_record(r2.as_ref()).expect("Failed to write R2");
}
writer.finish().expect("Failed to finish BAM");
}
#[test]
fn test_codec_command_basic_consensus() {
let temp_dir = TempDir::new().expect("Failed to create temp dir");
let input_bam = temp_dir.path().join("input.bam");
let output_bam = temp_dir.path().join("output.bam");
let mut pairs = Vec::new();
for i in 0..3 {
let (r1, r2) = create_codec_read_pair(
&format!("read{i}"),
b"ACGTACGT",
b"ACGTACGT",
&[30; 8],
&[30; 8],
100,
"UMI001",
None,
);
pairs.push((r1, r2));
}
create_codec_test_bam(&input_bam, pairs);
let cmd = Codec::try_parse_from([
"codec",
"--input",
input_bam.to_str().unwrap(),
"--output",
output_bam.to_str().unwrap(),
"--min-reads",
"1",
"--min-duplex-length",
"1",
"--compression-level",
"1",
])
.expect("failed to parse codec args");
cmd.execute("fgumi codec").expect("Failed to run codec command");
assert!(output_bam.exists(), "Output BAM not created");
let mut reader = bam::io::Reader::new(fs::File::open(&output_bam).unwrap());
let _header = reader.read_header().unwrap();
let mut consensus_count = 0;
for result in reader.records() {
let record = result.expect("Failed to read record");
consensus_count += 1;
let cd_tag = SamTag::CD.to_noodles_tag();
assert!(record.data().get(&cd_tag).is_some(), "Consensus should have cD tag");
}
assert!(consensus_count > 0, "Should have produced at least one consensus read");
}
#[test]
fn test_codec_command_with_stats() {
let temp_dir = TempDir::new().expect("Failed to create temp dir");
let input_bam = temp_dir.path().join("input.bam");
let output_bam = temp_dir.path().join("output.bam");
let stats_file = temp_dir.path().join("stats.tsv");
let mut pairs = Vec::new();
for i in 0..2 {
let (r1, r2) = create_codec_read_pair(
&format!("mol1_read{i}"),
b"ACGTACGT",
b"ACGTACGT",
&[30; 8],
&[30; 8],
100,
"UMI001",
None,
);
pairs.push((r1, r2));
}
for i in 0..3 {
let (r1, r2) = create_codec_read_pair(
&format!("mol2_read{i}"),
b"TGCATGCA",
b"TGCATGCA",
&[30; 8],
&[30; 8],
200,
"UMI002",
None,
);
pairs.push((r1, r2));
}
create_codec_test_bam(&input_bam, pairs);
let cmd = Codec::try_parse_from([
"codec",
"--input",
input_bam.to_str().unwrap(),
"--output",
output_bam.to_str().unwrap(),
"--stats",
stats_file.to_str().unwrap(),
"--min-reads",
"1",
"--min-duplex-length",
"1",
"--compression-level",
"1",
])
.expect("failed to parse codec args");
cmd.execute("fgumi codec").expect("Failed to run codec command");
assert!(stats_file.exists(), "Stats file not created");
let stats_content = fs::read_to_string(&stats_file).expect("Failed to read stats");
assert!(!stats_content.is_empty(), "Stats file should not be empty");
}
#[test]
fn test_codec_command_with_rejects() {
let temp_dir = TempDir::new().expect("Failed to create temp dir");
let input_bam = temp_dir.path().join("input.bam");
let output_bam = temp_dir.path().join("output.bam");
let rejects_bam = temp_dir.path().join("rejects.bam");
let mut pairs = Vec::new();
for i in 0..3 {
let (r1, r2) = create_codec_read_pair(
&format!("pass_read{i}"),
b"ACGTACGT",
b"ACGTACGT",
&[30; 8],
&[30; 8],
100,
"UMI_PASS",
None,
);
pairs.push((r1, r2));
}
let (r1, r2) = create_codec_read_pair(
"fail_read0",
b"TGCATGCA",
b"TGCATGCA",
&[30; 8],
&[30; 8],
200,
"UMI_FAIL",
None,
);
pairs.push((r1, r2));
create_codec_test_bam(&input_bam, pairs);
let cmd = Codec::try_parse_from([
"codec",
"--input",
input_bam.to_str().unwrap(),
"--output",
output_bam.to_str().unwrap(),
"--rejects",
rejects_bam.to_str().unwrap(),
"--min-reads",
"3",
"--min-duplex-length",
"1",
"--compression-level",
"1",
])
.expect("failed to parse codec args");
cmd.execute("fgumi codec").expect("Failed to run codec command");
assert!(output_bam.exists(), "Output BAM not created");
assert!(rejects_bam.exists(), "Rejects BAM not created");
let mut reader = bam::io::Reader::new(fs::File::open(&output_bam).unwrap());
let _header = reader.read_header().unwrap();
let consensus_count = reader.records().count();
assert!(consensus_count >= 1, "Should have consensus from passing molecule");
let mut rejects_reader = bam::io::Reader::new(fs::File::open(&rejects_bam).unwrap());
let _rejects_header = rejects_reader.read_header().unwrap();
let rejects_count = rejects_reader.records().count();
assert_eq!(rejects_count, 2, "Rejects BAM should contain both reads of the failing molecule");
}
#[test]
fn test_codec_command_min_duplex_length() {
let temp_dir = TempDir::new().expect("Failed to create temp dir");
let input_bam = temp_dir.path().join("input.bam");
let output_bam = temp_dir.path().join("output.bam");
let mut pairs = Vec::new();
for i in 0..3 {
let (r1, r2) = create_codec_read_pair(
&format!("read{i}"),
b"ACGTACGT",
b"ACGTACGT",
&[30; 8],
&[30; 8],
100,
"UMI001",
None,
);
pairs.push((r1, r2));
}
create_codec_test_bam(&input_bam, pairs);
let cmd = Codec::try_parse_from([
"codec",
"--input",
input_bam.to_str().unwrap(),
"--output",
output_bam.to_str().unwrap(),
"--min-reads",
"1",
"--min-duplex-length",
"100", "--compression-level",
"1",
])
.expect("failed to parse codec args");
cmd.execute("fgumi codec").expect("Failed to run codec command");
let mut reader = bam::io::Reader::new(fs::File::open(&output_bam).unwrap());
let _header = reader.read_header().unwrap();
let consensus_count = reader.records().count();
assert_eq!(consensus_count, 0, "Should have no consensus due to min-duplex-length filter");
}
#[test]
fn test_codec_command_per_base_tags() {
let temp_dir = TempDir::new().expect("Failed to create temp dir");
let input_bam = temp_dir.path().join("input.bam");
let output_bam = temp_dir.path().join("output.bam");
let mut pairs = Vec::new();
for i in 0..3 {
let (r1, r2) = create_codec_read_pair(
&format!("read{i}"),
b"ACGTACGT",
b"ACGTACGT",
&[30; 8],
&[30; 8],
100,
"UMI001",
None,
);
pairs.push((r1, r2));
}
create_codec_test_bam(&input_bam, pairs);
let cmd = Codec::try_parse_from([
"codec",
"--input",
input_bam.to_str().unwrap(),
"--output",
output_bam.to_str().unwrap(),
"--min-reads",
"1",
"--min-duplex-length",
"1",
"--output-per-base-tags",
"--compression-level",
"1",
])
.expect("failed to parse codec args");
cmd.execute("fgumi codec").expect("Failed to run codec command");
let mut reader = bam::io::Reader::new(fs::File::open(&output_bam).unwrap());
let _header = reader.read_header().unwrap();
for result in reader.records() {
let record = result.expect("Failed to read record");
let ad_tag = SamTag::AD_BASES.to_noodles_tag();
let bd_tag = SamTag::BD_BASES.to_noodles_tag();
assert!(record.data().get(&ad_tag).is_some(), "Should have per-base depth tag 'ad'");
assert!(record.data().get(&bd_tag).is_some(), "Should have per-base depth tag 'bd'");
}
}
#[test]
fn test_codec_command_cell_barcode_preservation() {
let temp_dir = TempDir::new().expect("Failed to create temp dir");
let input_bam = temp_dir.path().join("input.bam");
let output_bam = temp_dir.path().join("output.bam");
let mut pairs = Vec::new();
for i in 0..3 {
let (r1, r2) = create_codec_read_pair(
&format!("read{i}"),
b"ACGTACGT",
b"ACGTACGT",
&[30; 8],
&[30; 8],
100,
"UMI001",
Some("CELLBC123"),
);
pairs.push((r1, r2));
}
create_codec_test_bam(&input_bam, pairs);
let cmd = Codec::try_parse_from([
"codec",
"--input",
input_bam.to_str().unwrap(),
"--output",
output_bam.to_str().unwrap(),
"--min-reads",
"1",
"--min-duplex-length",
"1",
"--compression-level",
"1",
])
.expect("failed to parse codec args");
cmd.execute("fgumi codec").expect("Failed to run codec command");
assert!(output_bam.exists(), "Output BAM not created");
let mut reader = bam::io::Reader::new(fs::File::open(&output_bam).unwrap());
let _header = reader.read_header().unwrap();
let mut consensus_count = 0;
for result in reader.records() {
let record = result.expect("Failed to read record");
consensus_count += 1;
let cb_tag = SamTag::CB.to_noodles_tag();
assert!(
record.data().get(&cb_tag).is_some(),
"Consensus should have CB (cell barcode) tag"
);
}
assert!(consensus_count > 0, "Should have produced at least one consensus read");
}
#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
fn create_offset_codec_pair(name: &str, umi: &str) -> (RawRecord, RawRecord) {
const REF: &[u8; 40] = b"AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT";
const READ_LEN: usize = 30;
const QUAL: u8 = 35;
let r1_seq = REF[..READ_LEN].to_vec();
let r2_seq_fwd = REF[10..10 + READ_LEN].to_vec();
let r2_seq = reverse_complement(&r2_seq_fwd);
let cigar_op = (READ_LEN as u32) << 4;
let r1_pos = 0_i32; let r2_pos = 10_i32; let mc_tag = format!("{READ_LEN}M");
let template_len = (10 + READ_LEN) as i32;
let mut b1 = SamBuilder::new();
b1.read_name(name.as_bytes())
.sequence(&r1_seq)
.qualities(&[QUAL; READ_LEN])
.cigar_ops(&[cigar_op])
.flags(raw_flags::PAIRED | raw_flags::FIRST_SEGMENT | raw_flags::MATE_REVERSE)
.ref_id(0)
.pos(r1_pos)
.mapq(60)
.mate_ref_id(0)
.mate_pos(r2_pos)
.template_length(template_len)
.add_string_tag(SamTag::MI, umi.as_bytes())
.add_string_tag(SamTag::MC, mc_tag.as_bytes());
let mut b2 = SamBuilder::new();
b2.read_name(name.as_bytes())
.sequence(&r2_seq)
.qualities(&[QUAL; READ_LEN])
.cigar_ops(&[cigar_op])
.flags(raw_flags::PAIRED | raw_flags::LAST_SEGMENT | raw_flags::REVERSE)
.ref_id(0)
.pos(r2_pos)
.mapq(60)
.mate_ref_id(0)
.mate_pos(r1_pos)
.template_length(-template_len)
.add_string_tag(SamTag::MI, umi.as_bytes())
.add_string_tag(SamTag::MC, mc_tag.as_bytes());
(b1.build(), b2.build())
}
#[test]
fn test_codec_command_recovers_from_duplex_disagreement() {
let temp_dir = TempDir::new().expect("Failed to create temp dir");
let input_bam = temp_dir.path().join("input.bam");
let output_bam = temp_dir.path().join("output.bam");
let (r1, r2) = create_offset_codec_pair("disagree_read", "UMI_DISAGREE");
create_codec_test_bam(&input_bam, vec![(r1, r2)]);
let cmd = Codec::try_parse_from([
"codec",
"--input",
input_bam.to_str().unwrap(),
"--output",
output_bam.to_str().unwrap(),
"--min-reads",
"1",
"--min-duplex-length",
"1",
"--max-duplex-disagreements",
"0",
"--compression-level",
"1",
])
.expect("failed to parse codec args");
cmd.execute("fgumi codec")
.expect("Codec command must succeed when only failure is a recoverable reject");
let mut reader = bam::io::Reader::new(fs::File::open(&output_bam).unwrap());
let _header = reader.read_header().unwrap();
let consensus_count = reader.records().count();
assert_eq!(consensus_count, 0, "Disagreeing molecule should produce no consensus");
}
#[test]
fn test_codec_command_recovers_from_duplex_disagreement_threaded() {
let temp_dir = TempDir::new().expect("Failed to create temp dir");
let input_bam = temp_dir.path().join("input.bam");
let output_bam = temp_dir.path().join("output.bam");
let rejects_bam = temp_dir.path().join("rejects.bam");
let (r1, r2) = create_offset_codec_pair("disagree_read_mt", "UMI_DISAGREE_MT");
create_codec_test_bam(&input_bam, vec![(r1, r2)]);
let cmd = Codec::try_parse_from([
"codec",
"--input",
input_bam.to_str().unwrap(),
"--output",
output_bam.to_str().unwrap(),
"--rejects",
rejects_bam.to_str().unwrap(),
"--threads",
"2",
"--min-reads",
"1",
"--min-duplex-length",
"1",
"--max-duplex-disagreements",
"0",
"--compression-level",
"1",
])
.expect("failed to parse codec args");
cmd.execute("fgumi codec")
.expect("Threaded codec command must succeed when only failure is a recoverable reject");
let mut reader = bam::io::Reader::new(fs::File::open(&output_bam).unwrap());
let _header = reader.read_header().unwrap();
let consensus_count = reader.records().count();
assert_eq!(consensus_count, 0, "Disagreeing molecule should produce no consensus");
assert!(rejects_bam.exists(), "Rejects BAM file should be created");
let mut rejects_reader = bam::io::Reader::new(fs::File::open(&rejects_bam).unwrap());
let _ = rejects_reader.read_header().unwrap();
let reject_count = rejects_reader.records().count();
assert_eq!(
reject_count, 0,
"Disagreement short-circuits before reject tracking; rejects file is empty"
);
}