mod helpers;
use anyhow::Result;
use helpers::{SamBuilder, read_metrics_tsv};
use riker_lib::collector::Collector;
use riker_lib::commands::alignment::{
AlignmentCollector, AlignmentOptions, AlignmentSummaryMetric, METRICS_SUFFIX,
};
use riker_lib::sam::alignment_reader::AlignmentReader;
use tempfile::NamedTempFile;
fn run_alignment(bam: &std::path::Path) -> Result<Vec<AlignmentSummaryMetric>> {
let prefix = NamedTempFile::with_suffix(".alignment")?;
let prefix_path = prefix.path().to_path_buf();
let metrics_path =
std::path::PathBuf::from(format!("{}{METRICS_SUFFIX}", prefix_path.display()));
let mut collector =
AlignmentCollector::new(bam, &prefix_path, None, &AlignmentOptions::default());
let mut reader = AlignmentReader::open(bam, None)?;
let header = reader.header().clone();
collector.initialize(&header)?;
let requirements = collector.field_needs();
for result in reader.riker_records(&requirements) {
let record = result?;
collector.accept(&record, &header)?;
}
collector.finish()?;
read_metrics_tsv::<AlignmentSummaryMetric>(&metrics_path)
}
fn row<'a>(metrics: &'a [AlignmentSummaryMetric], category: &str) -> &'a AlignmentSummaryMetric {
metrics
.iter()
.find(|m| m.category == category)
.unwrap_or_else(|| panic!("no row for category '{category}'"))
}
#[test]
fn test_basic_paired_reads_first_and_second() -> Result<()> {
let mut builder = SamBuilder::new();
for i in 0..5 {
builder.add_pair(
&format!("read{i}"),
0,
100 + i * 200,
300 + i * 200,
200,
60,
100,
false,
false,
);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let first = row(&metrics, "read1");
let second = row(&metrics, "read2");
let pair = row(&metrics, "pair");
assert_eq!(first.total_reads, 5);
assert_eq!(second.total_reads, 5);
assert_eq!(pair.total_reads, 10);
assert_eq!(first.aligned_reads, 5);
assert_eq!(second.aligned_reads, 5);
assert_eq!(pair.aligned_reads, 10);
assert_eq!(first.aligned_reads_in_pairs, 5);
assert_eq!(second.aligned_reads_in_pairs, 5);
assert!((first.frac_aligned - 1.0).abs() < 1e-5);
assert!((first.strand_balance - 1.0).abs() < 1e-5);
assert!((second.strand_balance - 0.0).abs() < 1e-5);
assert!((pair.strand_balance - 0.5).abs() < 1e-5);
assert!((first.mean_read_length - 100.0).abs() < 1e-9);
assert_eq!(first.min_read_length, 100);
assert_eq!(first.max_read_length, 100);
Ok(())
}
#[test]
fn test_no_paired_data_produces_unpaired_row() -> Result<()> {
let builder = SamBuilder::new();
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
assert_eq!(metrics.len(), 1);
assert_eq!(metrics[0].category, "read1");
assert_eq!(metrics[0].total_reads, 0);
assert_eq!(metrics[0].aligned_reads, 0);
Ok(())
}
#[test]
fn test_unpaired_reads_category() -> Result<()> {
let mut builder = SamBuilder::new();
for i in 0..4 {
builder.add_unpaired(
&format!("r{i}"),
0,
1000 + i * 100,
60,
75,
i % 2 == 0, false,
false,
None,
);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
assert_eq!(metrics.len(), 1);
let unpaired = &metrics[0];
assert_eq!(unpaired.category, "read1");
assert_eq!(unpaired.total_reads, 4);
assert_eq!(unpaired.aligned_reads, 4);
assert!((unpaired.frac_aligned - 1.0).abs() < 1e-5);
assert!((unpaired.strand_balance - 0.5).abs() < 1e-5);
Ok(())
}
#[test]
fn test_qc_fail_counted_in_total_not_stats() -> Result<()> {
let mut builder = SamBuilder::new();
for i in 0..3 {
builder.add_pair(&format!("ok{i}"), 0, 100, 300, 200, 60, 100, false, false);
}
for i in 0..2 {
builder.add_pair(&format!("fail{i}"), 0, 100, 300, 200, 60, 100, false, true);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let first = row(&metrics, "read1");
assert_eq!(first.total_reads, 5);
assert_eq!(first.aligned_reads, 3);
Ok(())
}
#[test]
fn test_hq_threshold_filters_low_mapq() -> Result<()> {
let mut builder = SamBuilder::new();
for i in 0..3 {
builder.add_unpaired(&format!("hq{i}"), 0, 1000, 60, 100, false, false, false, None);
}
for i in 0..2 {
builder.add_unpaired(&format!("lq{i}"), 0, 2000, 10, 100, false, false, false, None);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let unpaired = row(&metrics, "read1");
assert_eq!(unpaired.aligned_reads, 5);
assert_eq!(unpaired.hq_aligned_reads, 3); Ok(())
}
#[test]
fn test_mismatch_rate_from_nm_tag() -> Result<()> {
let mut builder = SamBuilder::new();
for i in 0..2 {
builder.add_pair_with_nm(&format!("r{i}"), 0, 100, 300, 200, 60, 100, 2, 3);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let pair = row(&metrics, "pair");
assert!((pair.mismatch_rate - 0.025).abs() < 1e-6, "mismatch_rate={}", pair.mismatch_rate);
assert!(
(pair.hq_mismatch_rate - 0.025).abs() < 1e-6,
"hq_mismatch_rate={}",
pair.hq_mismatch_rate
);
Ok(())
}
#[test]
fn test_hq_median_mismatches() -> Result<()> {
let mut builder = SamBuilder::new();
for nm in [1u8, 2, 3, 4, 5] {
builder.add_unpaired(&format!("r{nm}"), 0, 1000, 60, 100, false, false, false, Some(nm));
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let unpaired = row(&metrics, "read1");
assert!((unpaired.hq_median_mismatches - 3.0).abs() < 0.1, "{}", unpaired.hq_median_mismatches);
Ok(())
}
#[test]
fn test_indel_rate_from_cigar() -> Result<()> {
use noodles::core::Position;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
let cigar: Cigar = [
Op::new(Kind::Match, 49),
Op::new(Kind::Insertion, 2),
Op::new(Kind::Match, 49),
Op::new(Kind::Deletion, 1),
]
.into_iter()
.collect();
let flags = Flags::empty();
let seq: Sequence = vec![b'A'; 100].into(); let qual = QualityScores::from(vec![30u8; 100]);
let record = noodles::sam::alignment::RecordBuf::builder()
.set_name("indel_read")
.set_flags(flags)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(100).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(seq)
.set_quality_scores(qual)
.build();
let mut sb = SamBuilder::new();
sb.add_record(record);
let bam = sb.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let unpaired = row(&metrics, "read1");
assert!((unpaired.indel_rate - 2.0 / 98.0).abs() < 1e-6, "indel_rate={}", unpaired.indel_rate);
Ok(())
}
#[test]
fn test_chimera_different_contigs() -> Result<()> {
use noodles::core::Position;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
let mut builder = SamBuilder::with_contigs(&[
("chr1".to_string(), 249_250_621),
("chr2".to_string(), 243_199_373),
]);
let cigar: Cigar = [Op::new(Kind::Match, 100)].into_iter().collect();
let seq: Sequence = vec![b'A'; 100].into();
let qual = QualityScores::from(vec![30u8; 100]);
let r1 = noodles::sam::alignment::RecordBuf::builder()
.set_name("chimera1")
.set_flags(Flags::SEGMENTED | Flags::MATE_REVERSE_COMPLEMENTED | Flags::FIRST_SEGMENT)
.set_reference_sequence_id(0) .set_alignment_start(Position::new(1000).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_mate_reference_sequence_id(1) .set_mate_alignment_start(Position::new(2000).unwrap())
.set_template_length(0)
.set_sequence(seq.clone())
.set_quality_scores(qual.clone())
.build();
let r2 = noodles::sam::alignment::RecordBuf::builder()
.set_name("chimera1")
.set_flags(Flags::SEGMENTED | Flags::REVERSE_COMPLEMENTED | Flags::LAST_SEGMENT)
.set_reference_sequence_id(1) .set_alignment_start(Position::new(2000).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_mate_reference_sequence_id(0) .set_mate_alignment_start(Position::new(1000).unwrap())
.set_template_length(0)
.set_sequence(seq)
.set_quality_scores(qual)
.build();
builder.add_record(r1);
builder.add_record(r2);
for i in 0..3 {
builder.add_pair(
&format!("normal{i}"),
0,
100 + i * 200,
300 + i * 200,
200,
60,
100,
false,
false,
);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let pair = row(&metrics, "pair");
assert!(pair.frac_chimeras > 0.0, "frac_chimeras should be > 0");
Ok(())
}
#[test]
fn test_chimera_large_insert() -> Result<()> {
use noodles::core::Position;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
let mut builder = SamBuilder::new();
let cigar: Cigar = [Op::new(Kind::Match, 100)].into_iter().collect();
let seq: Sequence = vec![b'A'; 100].into();
let qual = QualityScores::from(vec![30u8; 100]);
let r1 = noodles::sam::alignment::RecordBuf::builder()
.set_name("large_insert")
.set_flags(
Flags::SEGMENTED
| Flags::PROPERLY_SEGMENTED
| Flags::MATE_REVERSE_COMPLEMENTED
| Flags::FIRST_SEGMENT,
)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(1000).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_mate_reference_sequence_id(0)
.set_mate_alignment_start(Position::new(200_100).unwrap())
.set_template_length(200_000i32)
.set_sequence(seq.clone())
.set_quality_scores(qual.clone())
.build();
let r2 = noodles::sam::alignment::RecordBuf::builder()
.set_name("large_insert")
.set_flags(
Flags::SEGMENTED
| Flags::PROPERLY_SEGMENTED
| Flags::REVERSE_COMPLEMENTED
| Flags::LAST_SEGMENT,
)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(200_100).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_mate_reference_sequence_id(0)
.set_mate_alignment_start(Position::new(1000).unwrap())
.set_template_length(-200_000i32)
.set_sequence(seq)
.set_quality_scores(qual)
.build();
builder.add_record(r1);
builder.add_record(r2);
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let pair = row(&metrics, "pair");
assert!((pair.frac_chimeras - 1.0).abs() < 1e-5, "frac_chimeras={}", pair.frac_chimeras);
Ok(())
}
#[test]
fn test_bad_cycles_all_n_at_same_position() -> Result<()> {
use noodles::core::Position;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let mut builder = SamBuilder::new();
for i in 0..5 {
let mut bases = vec![b'A'; 10];
bases[0] = b'N'; let seq: Sequence = bases.into();
let qual = QualityScores::from(vec![30u8; 10]);
let record = noodles::sam::alignment::RecordBuf::builder()
.set_name(format!("r{i}").as_str())
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(100).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_sequence(seq)
.set_quality_scores(qual)
.build();
builder.add_record(record);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let unpaired = row(&metrics, "read1");
assert_eq!(unpaired.bad_cycles, 1);
Ok(())
}
#[test]
fn test_bad_cycles_below_threshold_not_counted() -> Result<()> {
use noodles::core::Position;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let mut builder = SamBuilder::new();
for i in 0..10 {
let mut bases = vec![b'A'; 10];
if i < 4 {
bases[0] = b'N';
}
let seq: Sequence = bases.into();
let qual = QualityScores::from(vec![30u8; 10]);
let record = noodles::sam::alignment::RecordBuf::builder()
.set_name(format!("r{i}").as_str())
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(100).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_sequence(seq)
.set_quality_scores(qual)
.build();
builder.add_record(record);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let unpaired = row(&metrics, "read1");
assert_eq!(unpaired.bad_cycles, 0);
Ok(())
}
#[test]
fn test_softclip_fraction() -> Result<()> {
use noodles::core::Position;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
let cigar: Cigar =
[Op::new(Kind::SoftClip, 10), Op::new(Kind::Match, 80), Op::new(Kind::SoftClip, 10)]
.into_iter()
.collect();
let seq: Sequence = vec![b'A'; 100].into();
let qual = QualityScores::from(vec![30u8; 100]);
let record = noodles::sam::alignment::RecordBuf::builder()
.set_name("sc_read")
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(100).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar)
.set_sequence(seq)
.set_quality_scores(qual)
.build();
let mut builder = SamBuilder::new();
builder.add_record(record);
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let unpaired = row(&metrics, "read1");
assert!(
(unpaired.frac_softclipped_reads - 0.2).abs() < 1e-5,
"frac_softclipped_reads={}",
unpaired.frac_softclipped_reads
);
assert!((unpaired.frac_hardclipped_reads - 0.0).abs() < 1e-5);
Ok(())
}
#[test]
fn test_reference_validation_missing_contig_errors() -> Result<()> {
use helpers::FastaBuilder;
let mut builder = SamBuilder::with_contigs(&[
("chr1".to_string(), 249_250_621),
("chrZ".to_string(), 1_000_000), ]);
builder.add_pair("r1", 0, 100, 300, 200, 60, 100, false, false);
let bam = builder.to_temp_bam()?;
let reference = FastaBuilder::new().add_contig("chr1", &vec![b'A'; 1000]).to_temp_fasta()?;
let prefix = NamedTempFile::with_suffix(".alignment")?;
let prefix_path = prefix.path().to_path_buf();
let mut collector = AlignmentCollector::new(
bam.path(),
&prefix_path,
Some(reference.path().to_path_buf()),
&AlignmentOptions::default(),
);
let reader = AlignmentReader::open(bam.path(), None)?;
let result = collector.initialize(reader.header());
assert!(result.is_err(), "expected error for missing contig");
assert!(result.unwrap_err().to_string().contains("chrZ"));
Ok(())
}
#[test]
fn test_output_file_created_with_correct_suffix() -> Result<()> {
let mut builder = SamBuilder::new();
builder.add_unpaired("r1", 0, 100, 60, 100, false, false, false, None);
let bam = builder.to_temp_bam()?;
let prefix = NamedTempFile::with_suffix(".align_test")?;
let prefix_path = prefix.path().to_path_buf();
let expected_metrics =
std::path::PathBuf::from(format!("{}{METRICS_SUFFIX}", prefix_path.display()));
let mut collector =
AlignmentCollector::new(bam.path(), &prefix_path, None, &AlignmentOptions::default());
let mut reader = AlignmentReader::open(bam.path(), None)?;
let header = reader.header().clone();
collector.initialize(&header)?;
let requirements = collector.field_needs();
for result in reader.riker_records(&requirements) {
let record = result?;
collector.accept(&record, &header)?;
}
collector.finish()?;
assert!(expected_metrics.exists(), "metrics file not created at expected path");
Ok(())
}
#[test]
fn test_pair_bad_cycles_is_sum_of_first_and_second() -> Result<()> {
use noodles::core::Position;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
let cigar: Cigar = [Op::new(Kind::Match, 10)].into_iter().collect();
let mut builder = SamBuilder::new();
for i in 0..5 {
let pos1 = 100 + i * 200;
let pos2 = 300 + i * 200;
let mut seq1 = vec![b'A'; 10];
seq1[0] = b'N';
let mut seq2 = vec![b'A'; 10];
seq2[0] = b'N';
let r1 = noodles::sam::alignment::RecordBuf::builder()
.set_name(format!("r{i}").as_str())
.set_flags(
Flags::SEGMENTED
| Flags::PROPERLY_SEGMENTED
| Flags::MATE_REVERSE_COMPLEMENTED
| Flags::FIRST_SEGMENT,
)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(pos1).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_mate_reference_sequence_id(0)
.set_mate_alignment_start(Position::new(pos2).unwrap())
.set_template_length(200)
.set_sequence(Sequence::from(seq1))
.set_quality_scores(QualityScores::from(vec![30u8; 10]))
.build();
let r2 = noodles::sam::alignment::RecordBuf::builder()
.set_name(format!("r{i}").as_str())
.set_flags(
Flags::SEGMENTED
| Flags::PROPERLY_SEGMENTED
| Flags::REVERSE_COMPLEMENTED
| Flags::LAST_SEGMENT,
)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(pos2).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_mate_reference_sequence_id(0)
.set_mate_alignment_start(Position::new(pos1).unwrap())
.set_template_length(-200)
.set_sequence(Sequence::from(seq2))
.set_quality_scores(QualityScores::from(vec![30u8; 10]))
.build();
builder.add_record(r1);
builder.add_record(r2);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let first = row(&metrics, "read1");
let second = row(&metrics, "read2");
let pair = row(&metrics, "pair");
assert_eq!(pair.bad_cycles, first.bad_cycles + second.bad_cycles);
assert_eq!(first.bad_cycles, 1, "read1 bad_cycles");
assert_eq!(second.bad_cycles, 1, "read2 bad_cycles");
assert_eq!(pair.bad_cycles, 2, "pair bad_cycles");
Ok(())
}
#[test]
fn test_improper_pairs_counted() -> Result<()> {
use noodles::core::Position;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
let mut builder = SamBuilder::new();
for i in 0..3 {
builder.add_pair(
&format!("ok{i}"),
0,
100 + i * 200,
300 + i * 200,
200,
60,
100,
false,
false,
);
}
let cigar: Cigar = [Op::new(Kind::Match, 100)].into_iter().collect();
let seq: Sequence = vec![b'A'; 100].into();
let qual = QualityScores::from(vec![30u8; 100]);
for i in 0..2 {
let pos1 = 10_000 + i * 500;
let pos2 = 10_200 + i * 500;
let r1 = noodles::sam::alignment::RecordBuf::builder()
.set_name(format!("imp{i}").as_str())
.set_flags(Flags::SEGMENTED | Flags::MATE_REVERSE_COMPLEMENTED | Flags::FIRST_SEGMENT)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(pos1).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_mate_reference_sequence_id(0)
.set_mate_alignment_start(Position::new(pos2).unwrap())
.set_template_length(200)
.set_sequence(seq.clone())
.set_quality_scores(qual.clone())
.build();
let r2 = noodles::sam::alignment::RecordBuf::builder()
.set_name(format!("imp{i}").as_str())
.set_flags(Flags::SEGMENTED | Flags::REVERSE_COMPLEMENTED | Flags::LAST_SEGMENT)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(pos2).unwrap())
.set_mapping_quality(MappingQuality::new(60).unwrap())
.set_cigar(cigar.clone())
.set_mate_reference_sequence_id(0)
.set_mate_alignment_start(Position::new(pos1).unwrap())
.set_template_length(-200)
.set_sequence(seq.clone())
.set_quality_scores(qual.clone())
.build();
builder.add_record(r1);
builder.add_record(r2);
}
let bam = builder.to_temp_bam()?;
let metrics = run_alignment(bam.path())?;
let pair = row(&metrics, "pair");
assert_eq!(pair.reads_improperly_paired, 4, "reads_improperly_paired");
assert!(
(pair.frac_reads_improperly_paired - 0.4).abs() < 1e-5,
"frac_improper={}",
pair.frac_reads_improperly_paired
);
Ok(())
}