mod helpers;
use std::path::PathBuf;
use helpers::{FastaBuilder, SamBuilder, coord_builder, read_metrics_tsv};
use noodles::core::Position;
use noodles::sam::alignment::RecordBuf;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
use riker_lib::commands::command::Command;
use riker_lib::commands::common::{InputOptions, OutputOptions};
use riker_lib::commands::error::{
Error, ErrorOptions, IndelMetric, MismatchMetric, OverlappingMismatchMetric,
};
#[expect(clippy::too_many_arguments)]
fn make_pair(
builder: &mut SamBuilder,
name: &str,
ref_id: usize,
pos1: usize,
pos2: usize,
tlen: i32,
seq1: &[u8],
seq2: &[u8],
quals: &[u8],
) {
let read_len = seq1.len();
let mapq = MappingQuality::new(60).unwrap();
let cigar: Cigar = [Op::new(Kind::Match, read_len)].into_iter().collect();
let r1 = RecordBuf::builder()
.set_name(name)
.set_flags(
Flags::SEGMENTED
| Flags::PROPERLY_SEGMENTED
| Flags::MATE_REVERSE_COMPLEMENTED
| Flags::FIRST_SEGMENT,
)
.set_reference_sequence_id(ref_id)
.set_alignment_start(Position::new(pos1).unwrap())
.set_mapping_quality(mapq)
.set_cigar(cigar.clone())
.set_mate_reference_sequence_id(ref_id)
.set_mate_alignment_start(Position::new(pos2).unwrap())
.set_template_length(tlen)
.set_sequence(Sequence::from(seq1.to_vec()))
.set_quality_scores(QualityScores::from(quals.to_vec()))
.build();
let r2 = RecordBuf::builder()
.set_name(name)
.set_flags(
Flags::SEGMENTED
| Flags::PROPERLY_SEGMENTED
| Flags::REVERSE_COMPLEMENTED
| Flags::LAST_SEGMENT,
)
.set_reference_sequence_id(ref_id)
.set_alignment_start(Position::new(pos2).unwrap())
.set_mapping_quality(mapq)
.set_cigar(cigar)
.set_mate_reference_sequence_id(ref_id)
.set_mate_alignment_start(Position::new(pos1).unwrap())
.set_template_length(-tlen)
.set_sequence(Sequence::from(seq2.to_vec()))
.set_quality_scores(QualityScores::from(quals.to_vec()))
.build();
builder.add_record(r1);
builder.add_record(r2);
}
fn run_error(
bam_path: &std::path::Path,
fasta_path: &std::path::Path,
stratify_by: Vec<String>,
) -> PathBuf {
let dir = tempfile::tempdir().unwrap();
let prefix = dir.path().join("out");
let cmd = Error {
input: InputOptions { input: bam_path.to_path_buf() },
output: OutputOptions { output: prefix.clone() },
options: ErrorOptions {
reference: fasta_path.to_path_buf(),
vcf: None,
intervals: None,
min_mapq: 20,
min_bq: 0, include_duplicates: false,
max_isize: 1000,
picard_compat: false,
stratify_by,
},
};
cmd.execute().expect("error command should succeed");
let path = prefix.clone();
std::mem::forget(dir);
path
}
fn test_reference() -> (tempfile::NamedTempFile, Vec<u8>) {
let seq: Vec<u8> = (0..1000).map(|i| b"ACGT"[i % 4]).collect();
let fasta = FastaBuilder::new().add_contig("chr1", &seq).to_temp_fasta().unwrap();
(fasta, seq)
}
#[test]
fn test_no_errors() {
let (fasta, ref_seq) = test_reference();
let seq = ref_seq[99..109].to_vec(); let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
make_pair(&mut builder, "read1", 0, 100, 120, 30, &seq, &ref_seq[119..129], &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
assert!(!mm.is_empty());
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert!(all_row.total_bases > 0);
assert_eq!(all_row.error_bases, 0);
assert_float_eq!(all_row.frac_error, 0.0, 1e-9);
}
#[test]
fn test_simple_mismatches() {
let (fasta, ref_seq) = test_reference();
let mut seq = ref_seq[99..109].to_vec(); seq[0] = b'T'; let mut seq = ref_seq[100..110].to_vec(); seq[0] = b'G'; seq[5] = b'T'; let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap()) .set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 10);
assert_eq!(all_row.error_bases, 2);
assert!((all_row.frac_error - 0.2).abs() < 1e-6);
}
#[test]
fn test_stratify_by_strand() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let mut seq_fwd = ref_seq[100..110].to_vec();
seq_fwd[0] = b'G'; let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let r_fwd = RecordBuf::builder()
.set_name("read_fwd")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_sequence(Sequence::from(seq_fwd))
.set_quality_scores(QualityScores::from(quals.clone()))
.build();
let seq_rev = ref_seq[200..210].to_vec();
let r_rev = RecordBuf::builder()
.set_name("read_rev")
.set_flags(Flags::REVERSE_COMPLEMENTED)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(201).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq_rev))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(r_fwd);
builder.add_record(r_rev);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["strand".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let fwd = mm.iter().find(|r| r.stratifier == "strand" && r.covariate == "+").unwrap();
let rev = mm.iter().find(|r| r.stratifier == "strand" && r.covariate == "-").unwrap();
assert_eq!(fwd.total_bases, 10);
assert_eq!(fwd.error_bases, 1);
assert_eq!(rev.total_bases, 10);
assert_eq!(rev.error_bases, 0);
}
#[test]
fn test_insertion_detection() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 13];
let mut builder = coord_builder(&[("chr1", 1000)]);
let mut seq = ref_seq[100..110].to_vec();
seq.extend_from_slice(b"AAA"); let cigar: Cigar =
[Op::new(Kind::Match, 10), Op::new(Kind::Insertion, 3)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let indels: Vec<IndelMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-indel.txt")).unwrap();
let all_row = indels.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 10); assert_eq!(all_row.num_insertions, 1);
assert_eq!(all_row.num_inserted_bases, 3);
assert_eq!(all_row.num_deletions, 0);
}
#[test]
fn test_deletion_detection() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let mut seq = ref_seq[100..105].to_vec(); seq.extend_from_slice(&ref_seq[107..112]); let cigar: Cigar =
[Op::new(Kind::Match, 5), Op::new(Kind::Deletion, 2), Op::new(Kind::Match, 5)]
.into_iter()
.collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let indels: Vec<IndelMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-indel.txt")).unwrap();
let all_row = indels.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 10); assert_eq!(all_row.num_deletions, 1);
assert_eq!(all_row.num_deleted_bases, 2);
assert_eq!(all_row.num_insertions, 0);
}
#[test]
fn test_min_mapq_filter() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq = ref_seq[100..110].to_vec();
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(10).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
if let Some(all_row) = mm.iter().find(|r| r.stratifier == "all") {
assert_eq!(all_row.total_bases, 0);
}
}
#[test]
fn test_duplicate_exclusion() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq = ref_seq[100..110].to_vec();
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read_dup")
.set_flags(Flags::DUPLICATE)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_sequence(Sequence::from(seq.clone()))
.set_quality_scores(QualityScores::from(quals.clone()))
.build();
let mut seq2 = ref_seq[100..110].to_vec();
seq2[0] = b'G';
let record2 = RecordBuf::builder()
.set_name("read_ok")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq2))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
builder.add_record(record2);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 10);
assert_eq!(all_row.error_bases, 1);
}
#[test]
fn test_all_group_always_present() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq = ref_seq[100..110].to_vec();
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["bq".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
assert!(mm.iter().any(|r| r.stratifier == "all"));
assert!(mm.iter().any(|r| r.stratifier == "bq"));
}
#[test]
fn test_composite_stratifier() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq = ref_seq[100..110].to_vec();
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["strand,mapq".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let composite_rows: Vec<_> = mm.iter().filter(|r| r.stratifier == "strand,mapq").collect();
assert!(!composite_rows.is_empty());
assert!(composite_rows.iter().any(|r| r.covariate.contains(',')));
}
#[test]
fn test_overlapping_reads_mismatching_ref_and_mate() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq1 = ref_seq[100..110].to_vec();
let mut seq2 = ref_seq[105..115].to_vec();
seq2[2] = b'N'; let mut seq2 = ref_seq[105..115].to_vec();
seq2[2] = b'A';
make_pair(
&mut builder,
"pair1",
0,
101, 106, 15, &seq1,
&seq2,
&quals,
);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let ov: Vec<OverlappingMismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-overlap.txt")).unwrap();
if let Some(all_row) = ov.iter().find(|r| r.stratifier == "all") {
assert!(all_row.overlapping_read_bases > 0, "Expected overlapping bases");
}
}
#[test]
fn test_three_output_files_always_created() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq = ref_seq[100..110].to_vec();
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
assert!(prefix.with_file_name("out.error-mismatch.txt").exists());
assert!(prefix.with_file_name("out.error-overlap.txt").exists());
assert!(prefix.with_file_name("out.error-indel.txt").exists());
}
fn run_error_with_bq(
bam_path: &std::path::Path,
fasta_path: &std::path::Path,
stratify_by: Vec<String>,
min_bq: u8,
) -> PathBuf {
let dir = tempfile::tempdir().unwrap();
let prefix = dir.path().join("out");
let cmd = Error {
input: InputOptions { input: bam_path.to_path_buf() },
output: OutputOptions { output: prefix.clone() },
options: ErrorOptions {
reference: fasta_path.to_path_buf(),
vcf: None,
intervals: None,
min_mapq: 20,
min_bq,
include_duplicates: false,
max_isize: 1000,
picard_compat: false,
stratify_by,
},
};
cmd.execute().expect("error command should succeed");
let path = prefix.clone();
std::mem::forget(dir);
path
}
#[test]
fn test_min_bq_filter() {
let (fasta, ref_seq) = test_reference();
let seq = ref_seq[100..110].to_vec(); let mut quals = vec![10u8; 10];
for q in quals.iter_mut().skip(5) {
*q = 30;
}
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error_with_bq(bam.path(), fasta.path(), vec![], 20);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 5);
}
#[test]
fn test_n_bases_excluded() {
let (fasta, ref_seq) = test_reference();
let mut seq = ref_seq[100..110].to_vec();
seq[4] = b'N';
seq[9] = b'N';
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 8);
assert_eq!(all_row.error_bases, 0);
}
#[test]
fn test_soft_clip_excluded() {
let (fasta, ref_seq) = test_reference();
let mut seq = vec![b'A'; 3]; seq.extend_from_slice(&ref_seq[100..107]); let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::SoftClip, 3), Op::new(Kind::Match, 7)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 7);
}
#[test]
fn test_secondary_supplementary_excluded() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let mut primary_seq = ref_seq[100..110].to_vec();
primary_seq[0] = b'G'; let primary = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_sequence(Sequence::from(primary_seq))
.set_quality_scores(QualityScores::from(quals.clone()))
.build();
let mut secondary_seq = ref_seq[100..110].to_vec();
secondary_seq[1] = b'T'; let secondary = RecordBuf::builder()
.set_name("read2")
.set_flags(Flags::SECONDARY)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_sequence(Sequence::from(secondary_seq))
.set_quality_scores(QualityScores::from(quals.clone()))
.build();
let mut supplementary_seq = ref_seq[100..110].to_vec();
supplementary_seq[2] = b'T'; let supplementary = RecordBuf::builder()
.set_name("read3")
.set_flags(Flags::SUPPLEMENTARY)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(supplementary_seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(primary);
builder.add_record(secondary);
builder.add_record(supplementary);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 10);
assert_eq!(all_row.error_bases, 1);
}
#[test]
fn test_mixed_insertion_and_deletion() {
let (fasta, ref_seq) = test_reference();
let mut seq = ref_seq[100..105].to_vec(); seq.extend_from_slice(b"AA"); seq.extend_from_slice(&ref_seq[105..110]); seq.extend_from_slice(&ref_seq[113..118]); let quals = vec![30u8; 17];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [
Op::new(Kind::Match, 5),
Op::new(Kind::Insertion, 2),
Op::new(Kind::Match, 5),
Op::new(Kind::Deletion, 3),
Op::new(Kind::Match, 5),
]
.into_iter()
.collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let indels: Vec<IndelMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-indel.txt")).unwrap();
let all_row = indels.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 15);
assert_eq!(all_row.num_insertions, 1);
assert_eq!(all_row.num_inserted_bases, 2);
assert_eq!(all_row.num_deletions, 1);
assert_eq!(all_row.num_deleted_bases, 3);
}
#[test]
fn test_q_score_calculation() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
for i in 0..100 {
let mut seq = ref_seq[100..110].to_vec();
if i == 0 {
seq[0] = b'G'; }
let record = RecordBuf::builder()
.set_name(format!("read{i}").as_str())
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals.clone()))
.build();
builder.add_record(record);
}
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 1000);
assert_eq!(all_row.error_bases, 1);
assert_float_eq!(all_row.q_score, 30.0, 0.01);
}
#[test]
fn test_stratify_by_cycle() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let mut seq_fwd = ref_seq[100..110].to_vec();
seq_fwd[2] = b'T'; let fwd = RecordBuf::builder()
.set_name("read_fwd")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_sequence(Sequence::from(seq_fwd))
.set_quality_scores(QualityScores::from(quals.clone()))
.build();
let mut seq_rev = ref_seq[200..210].to_vec();
seq_rev[7] = b'A'; let rev = RecordBuf::builder()
.set_name("read_rev")
.set_flags(Flags::REVERSE_COMPLEMENTED)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(201).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq_rev))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(fwd);
builder.add_record(rev);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["cycle".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let cycle3 = mm.iter().find(|r| r.stratifier == "cycle" && r.covariate == "3").unwrap();
assert_eq!(cycle3.error_bases, 2);
}
#[test]
fn test_stratify_by_read_num() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let mut seq1 = ref_seq[100..110].to_vec();
seq1[0] = b'G'; let seq2 = ref_seq[119..129].to_vec();
make_pair(&mut builder, "pair1", 0, 101, 120, 29, &seq1, &seq2, &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["read_num".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let r1 = mm.iter().find(|r| r.stratifier == "read_num" && r.covariate == "R1").unwrap();
let r2 = mm.iter().find(|r| r.stratifier == "read_num" && r.covariate == "R2").unwrap();
assert_eq!(r1.error_bases, 1);
assert_eq!(r2.error_bases, 0);
}
#[test]
fn test_stratify_by_ref_base() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let mut seq = ref_seq[100..110].to_vec();
seq[0] = b'G';
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["ref_base".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let ref_a = mm.iter().find(|r| r.stratifier == "ref_base" && r.covariate == "A").unwrap();
assert!(ref_a.error_bases >= 1);
let ref_c = mm.iter().find(|r| r.stratifier == "ref_base" && r.covariate == "C").unwrap();
assert_eq!(ref_c.error_bases, 0);
}
#[test]
fn test_stratify_by_hp_len() {
let mut custom_seq: Vec<u8> = Vec::with_capacity(1000);
custom_seq.extend((0..100).map(|i| b"ACGT"[i % 4]));
custom_seq.extend_from_slice(b"AAAA");
custom_seq.extend((104..1000).map(|i| b"ACGT"[i % 4]));
let fasta = FastaBuilder::new().add_contig("chr1", &custom_seq).to_temp_fasta().unwrap();
let seq = custom_seq[97..107].to_vec(); let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(98).unwrap()) .set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["hp_len".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let hp_rows: Vec<_> = mm.iter().filter(|r| r.stratifier == "hp_len").collect();
assert!(!hp_rows.is_empty());
assert!(hp_rows.iter().any(|r| r.covariate == "0"));
assert!(hp_rows.iter().any(|r| {
let val: i64 = r.covariate.parse().unwrap_or(0);
val > 0
}));
}
#[test]
fn test_stratify_by_indel_len() {
let (fasta, ref_seq) = test_reference();
let mut seq = ref_seq[100..105].to_vec();
seq.extend_from_slice(b"AAA"); seq.extend_from_slice(&ref_seq[105..110]);
let quals = vec![30u8; 13];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar =
[Op::new(Kind::Match, 5), Op::new(Kind::Insertion, 3), Op::new(Kind::Match, 5)]
.into_iter()
.collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["indel_len".to_string()]);
let indels: Vec<IndelMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-indel.txt")).unwrap();
let indel_3 = indels.iter().find(|r| r.stratifier == "indel_len" && r.covariate == "3");
assert!(indel_3.is_some(), "Expected indel_len=3 row for the 3bp insertion");
let indel_3 = indel_3.unwrap();
assert_eq!(indel_3.num_insertions, 1);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let mm_0 = mm.iter().find(|r| r.stratifier == "indel_len" && r.covariate == "0");
assert!(mm_0.is_some(), "Expected indel_len=0 row for aligned bases in mismatch metrics");
let mm_0 = mm_0.unwrap();
assert_eq!(mm_0.total_bases, 10); }
#[test]
fn test_overlapping_reads_all_agree() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq1 = ref_seq[100..110].to_vec(); let seq2 = ref_seq[105..115].to_vec();
make_pair(&mut builder, "pair1", 0, 101, 106, 15, &seq1, &seq2, &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let ov: Vec<OverlappingMismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-overlap.txt")).unwrap();
let all_row = ov.iter().find(|r| r.stratifier == "all").unwrap();
assert!(all_row.overlapping_read_bases > 0, "Expected overlapping bases");
assert_eq!(all_row.bases_mismatching_ref_and_mate, 0);
assert_eq!(all_row.bases_matching_mate_but_not_ref, 0);
assert_eq!(all_row.bases_in_three_way_disagreement, 0);
}
#[test]
fn test_overlapping_reads_matching_mate_but_not_ref() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let mut seq1 = ref_seq[100..110].to_vec(); seq1[7] = b'A';
let mut seq2 = ref_seq[105..115].to_vec(); seq2[2] = b'A';
make_pair(&mut builder, "pair1", 0, 101, 106, 15, &seq1, &seq2, &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let ov: Vec<OverlappingMismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-overlap.txt")).unwrap();
let all_row = ov.iter().find(|r| r.stratifier == "all").unwrap();
assert!(all_row.overlapping_read_bases > 0);
assert!(
all_row.bases_matching_mate_but_not_ref > 0,
"Expected bases_matching_mate_but_not_ref > 0, got {}",
all_row.bases_matching_mate_but_not_ref
);
}
#[test]
fn test_overlapping_reads_three_way() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let mut seq1 = ref_seq[100..110].to_vec(); seq1[7] = b'A';
let mut seq2 = ref_seq[105..115].to_vec(); seq2[2] = b'C';
make_pair(&mut builder, "pair1", 0, 101, 106, 15, &seq1, &seq2, &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let ov: Vec<OverlappingMismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-overlap.txt")).unwrap();
let all_row = ov.iter().find(|r| r.stratifier == "all").unwrap();
assert!(all_row.overlapping_read_bases > 0);
assert!(
all_row.bases_in_three_way_disagreement > 0,
"Expected bases_in_three_way_disagreement > 0, got {}",
all_row.bases_in_three_way_disagreement
);
}
#[test]
fn test_overlapping_reads_mismatching_ref_and_mate_exact() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq1 = ref_seq[100..110].to_vec();
let mut seq2 = ref_seq[105..115].to_vec();
seq2[2] = b'A';
make_pair(&mut builder, "pair1", 0, 101, 106, 15, &seq1, &seq2, &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let ov: Vec<OverlappingMismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-overlap.txt")).unwrap();
let all_row = ov.iter().find(|r| r.stratifier == "all").unwrap();
assert!(all_row.overlapping_read_bases > 0, "Expected overlapping bases");
assert_eq!(all_row.bases_mismatching_ref_and_mate, 1);
assert_eq!(all_row.bases_matching_mate_but_not_ref, 0);
assert_eq!(all_row.bases_in_three_way_disagreement, 0);
}
#[test]
fn test_multi_contig() {
let seq1: Vec<u8> = (0..500).map(|i| b"ACGT"[i % 4]).collect();
let seq2: Vec<u8> = (0..500).map(|i| b"ACGT"[i % 4]).collect();
let fasta = FastaBuilder::new()
.add_contig("chr1", &seq1)
.add_contig("chr2", &seq2)
.to_temp_fasta()
.unwrap();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 500), ("chr2", 500)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let read_seq1 = seq1[100..110].to_vec();
let r1 = RecordBuf::builder()
.set_name("read_chr1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_sequence(Sequence::from(read_seq1))
.set_quality_scores(QualityScores::from(quals.clone()))
.build();
let read_seq2 = seq2[100..110].to_vec();
let r2 = RecordBuf::builder()
.set_name("read_chr2")
.set_flags(Flags::empty())
.set_reference_sequence_id(1)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(read_seq2))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(r1);
builder.add_record(r2);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 20);
}
#[test]
fn test_deletion_at_read_start() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq = ref_seq[102..112].to_vec(); let cigar: Cigar = [Op::new(Kind::Deletion, 2), Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let indels: Vec<IndelMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-indel.txt")).unwrap();
let all_row = indels.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 10);
assert_eq!(all_row.num_deletions, 0);
}
#[test]
fn test_insertion_at_read_start() {
let (fasta, ref_seq) = test_reference();
let mut seq = vec![b'A'; 3]; seq.extend_from_slice(&ref_seq[100..110]); let quals = vec![30u8; 13];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar =
[Op::new(Kind::Insertion, 3), Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let indels: Vec<IndelMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-indel.txt")).unwrap();
let all_row = indels.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.num_insertions, 0);
}
#[test]
fn test_stratifier_parse_error() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq = ref_seq[100..110].to_vec();
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let dir = tempfile::tempdir().unwrap();
let prefix = dir.path().join("out");
let cmd = Error {
input: InputOptions { input: bam.path().to_path_buf() },
output: OutputOptions { output: prefix },
options: ErrorOptions {
reference: fasta.path().to_path_buf(),
vcf: None,
intervals: None,
min_mapq: 20,
min_bq: 0,
include_duplicates: false,
max_isize: 1000,
picard_compat: false,
stratify_by: vec!["invalid_name".to_string()],
},
};
let result = cmd.execute();
assert!(result.is_err(), "Expected error for invalid stratifier name");
}
#[test]
fn test_gc_stratification() {
let (fasta, ref_seq) = test_reference();
let seq = ref_seq[100..110].to_vec();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["gc".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let gc_rows: Vec<_> = mm.iter().filter(|r| r.stratifier == "gc").collect();
assert!(!gc_rows.is_empty());
let gc50 = gc_rows.iter().find(|r| r.covariate == "50");
assert!(
gc50.is_some(),
"Expected gc=50 row, found: {:?}",
gc_rows.iter().map(|r| &r.covariate).collect::<Vec<_>>()
);
assert_eq!(gc50.unwrap().total_bases, 10);
}
#[test]
fn test_pre_dinuc_stratification() {
let (fasta, ref_seq) = test_reference();
let seq = ref_seq[100..110].to_vec();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["pre_dinuc".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let dinuc_rows: Vec<_> = mm.iter().filter(|r| r.stratifier == "pre_dinuc").collect();
assert!(!dinuc_rows.is_empty());
assert!(
dinuc_rows.iter().any(|r| r.covariate == "AC"),
"Expected 'AC' dinuc, found: {:?}",
dinuc_rows.iter().map(|r| &r.covariate).collect::<Vec<_>>()
);
}
#[test]
fn test_context_3bp_stratification() {
let (fasta, ref_seq) = test_reference();
let seq = ref_seq[100..110].to_vec();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec!["context_3bp".to_string()]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let ctx_rows: Vec<_> = mm.iter().filter(|r| r.stratifier == "context_3bp").collect();
assert!(!ctx_rows.is_empty());
assert!(
ctx_rows.iter().any(|r| r.covariate == "ACG"),
"Expected 'ACG' context, found: {:?}",
ctx_rows.iter().map(|r| &r.covariate).collect::<Vec<_>>()
);
}
#[test]
fn test_orphaned_buffered_read_still_counted() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let mut seq = ref_seq[100..110].to_vec(); seq[5] = b'G';
let mapq = MappingQuality::new(60).unwrap();
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let r1 = RecordBuf::builder()
.set_name("orphan")
.set_flags(
Flags::SEGMENTED
| Flags::PROPERLY_SEGMENTED
| Flags::MATE_REVERSE_COMPLEMENTED
| Flags::FIRST_SEGMENT,
)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(mapq)
.set_cigar(cigar)
.set_mate_reference_sequence_id(0)
.set_mate_alignment_start(Position::new(106).unwrap()) .set_template_length(15) .set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(r1);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 10, "Orphaned read's 10 bases should be counted");
assert_eq!(all_row.error_bases, 1, "Orphaned read's mismatch should be counted");
}
#[test]
fn test_insertion_low_bq_first_base_excluded() {
let (fasta, ref_seq) = test_reference();
let mut seq = ref_seq[100..130].to_vec(); seq.extend_from_slice(b"AA"); seq.extend_from_slice(&ref_seq[130..174]); assert_eq!(seq.len(), 76);
let mut quals = vec![30u8; 76];
quals[30] = 5;
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar =
[Op::new(Kind::Match, 30), Op::new(Kind::Insertion, 2), Op::new(Kind::Match, 44)]
.into_iter()
.collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error_with_bq(bam.path(), fasta.path(), vec![], 20);
let indels: Vec<IndelMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-indel.txt")).unwrap();
let all_row = indels.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(
all_row.num_insertions, 0,
"Insertion should be skipped when first base BQ < min_bq"
);
assert_eq!(all_row.num_inserted_bases, 0);
assert_eq!(all_row.total_bases, 74);
}
#[test]
fn test_overlap_double_counted() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let mut seq1 = ref_seq[100..110].to_vec();
seq1[6] = b'T';
let mut seq2 = ref_seq[105..115].to_vec();
seq2[1] = b'T';
make_pair(&mut builder, "pair1", 0, 101, 106, 15, &seq1, &seq2, &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let ov: Vec<OverlappingMismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-overlap.txt")).unwrap();
let all_row = ov.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(
all_row.overlapping_read_bases, 10,
"Both reads should contribute to overlapping_read_bases"
);
assert!(
all_row.bases_matching_mate_but_not_ref > 0,
"Expected bases_matching_mate_but_not_ref > 0, got {}",
all_row.bases_matching_mate_but_not_ref
);
}
#[test]
fn test_max_isize_exclusion() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let small_fwd = ref_seq[100..110].to_vec();
let small_rev = ref_seq[105..115].to_vec();
make_pair(&mut builder, "small_pair", 0, 101, 106, 15, &small_fwd, &small_rev, &quals);
let large_fwd = ref_seq[100..110].to_vec();
let large_rev = ref_seq[590..600].to_vec();
make_pair(&mut builder, "large_pair", 0, 101, 591, 500, &large_fwd, &large_rev, &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let dir = tempfile::tempdir().unwrap();
let prefix = dir.path().join("out");
let cmd = Error {
input: InputOptions { input: bam.path().to_path_buf() },
output: OutputOptions { output: prefix.clone() },
options: ErrorOptions {
reference: fasta.path().to_path_buf(),
vcf: None,
intervals: None,
min_mapq: 20,
min_bq: 0,
include_duplicates: false,
max_isize: 100,
picard_compat: false,
stratify_by: vec!["all".into(), "isize".into()],
},
};
cmd.execute().expect("error command should succeed");
let path = prefix.clone();
std::mem::forget(dir);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&path.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(all_row.total_bases, 40, "All 4 reads should be counted in the 'all' group");
let isize_15 = mm.iter().find(|r| r.stratifier == "isize" && r.covariate == "15");
assert!(isize_15.is_some(), "Expected isize=15 row for the small pair");
let isize_500 = mm.iter().find(|r| r.stratifier == "isize" && r.covariate == "500");
assert!(isize_500.is_none(), "tlen=500 should be excluded when max_isize=100");
}
#[test]
fn test_non_overlapping_pair_no_overlap_metrics() {
let (fasta, ref_seq) = test_reference();
let quals = vec![30u8; 10];
let mut builder = coord_builder(&[("chr1", 1000)]);
let seq1 = ref_seq[100..110].to_vec();
let seq2 = ref_seq[199..209].to_vec();
make_pair(&mut builder, "pair1", 0, 101, 200, 109, &seq1, &seq2, &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let ov: Vec<OverlappingMismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-overlap.txt")).unwrap();
if let Some(all_row) = ov.iter().find(|r| r.stratifier == "all") {
assert_eq!(
all_row.overlapping_read_bases, 0,
"Non-overlapping pair should have 0 overlapping read bases"
);
}
}
#[test]
fn test_insertion_at_read_start_no_anchor() {
let (fasta, ref_seq) = test_reference();
let mut seq = vec![b'A'; 3]; seq.extend_from_slice(&ref_seq[100..150]); let quals = vec![30u8; 53];
let mut builder = coord_builder(&[("chr1", 1000)]);
let cigar: Cigar =
[Op::new(Kind::Insertion, 3), Op::new(Kind::Match, 50)].into_iter().collect();
let record = RecordBuf::builder()
.set_name("read1")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(Sequence::from(seq))
.set_quality_scores(QualityScores::from(quals))
.build();
builder.add_record(record);
let bam = builder.to_temp_indexed_bam().unwrap();
let prefix = run_error(bam.path(), fasta.path(), vec![]);
let indels: Vec<IndelMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-indel.txt")).unwrap();
let all_row = indels.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(
all_row.num_insertions, 0,
"Insertion at read start should have no anchor and be skipped"
);
assert_eq!(all_row.total_bases, 50);
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let mm_all = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(mm_all.total_bases, 50, "Aligned M bases should still be counted for mismatches");
}
#[test]
fn test_cross_contig_orphan_not_processed_against_wrong_contig() {
let chr1 = vec![b'A'; 200];
let chr2 = vec![b'G'; 200];
let fasta = FastaBuilder::new()
.add_contig("chr1", &chr1)
.add_contig("chr2", &chr2)
.to_temp_fasta()
.unwrap();
let quals = vec![30u8; 50];
let seq_all_a = vec![b'A'; 50];
let mut builder = coord_builder(&[("chr1", 200), ("chr2", 200)]);
make_pair(&mut builder, "read", 0, 1, 30, 79, &seq_all_a, &seq_all_a, &quals);
let bam = builder.to_temp_indexed_bam().unwrap();
let dir = tempfile::tempdir().unwrap();
let bed_path = dir.path().join("regions.bed");
std::fs::write(&bed_path, "chr1\t0\t15\nchr2\t0\t200\n").unwrap();
let prefix = dir.path().join("out");
let cmd = Error {
input: InputOptions { input: bam.path().to_path_buf() },
output: OutputOptions { output: prefix.clone() },
options: ErrorOptions {
reference: fasta.path().to_path_buf(),
vcf: None,
intervals: Some(bed_path),
min_mapq: 20,
min_bq: 0,
include_duplicates: false,
max_isize: 1000,
picard_compat: false,
stratify_by: vec![],
},
};
cmd.execute().expect("error command should succeed");
let mm: Vec<MismatchMetric> =
read_metrics_tsv(&prefix.with_file_name("out.error-mismatch.txt")).unwrap();
let all_row = mm.iter().find(|r| r.stratifier == "all").unwrap();
assert_eq!(
all_row.error_bases, 0,
"orphan on chr1 must not be compared against chr2's reference bases"
);
}