use crate::commands::common::parse_bool;
use crate::logging::OperationTimer;
use crate::progress::ProgressTracker;
use crate::reference::find_dict_path;
use crate::sam::SamTag;
use crate::umi::extract_mi_base;
use crate::validation::validate_file_exists;
use crate::variant_review::{
BaseCounts, ConsensusVariantReviewInfo, Variant, format_insert_string, read_number_suffix,
};
use anyhow::{Result, bail};
use clap::Parser;
use fgumi_raw_bam::{
BAM_BASE_TO_ASCII, CigarKind, IndexedRawBamReader, RawBamReader, RawRecord,
find_string_tag_in_record, write_raw_record,
};
use log::info;
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use super::command::Command;
// Command-line arguments for the `review` subcommand.
//
// NOTE(review): field comments deliberately use `//` rather than `///` —
// none of the args declare clap help strings, so doc comments would be
// picked up by clap and change the generated `--help` output.
#[derive(Parser, Debug)]
#[command(
    name = "review",
    author,
    version,
    about = "\x1b[38;5;173m[POST-CONSENSUS]\x1b[0m \x1b[36mExtract data to review variant calls from consensus reads\x1b[0m",
    long_about = r#"
Extracts data to make reviewing of variant calls from consensus reads easier.
Creates a list of variant sites from the input VCF (SNPs only) or IntervalList then extracts all
the consensus reads that do not contain a reference allele at the variant sites, and all raw reads
that contributed to those consensus reads. This will include consensus reads that carry the
alternate allele, a third allele, a no-call or a spanning deletion at the variant site.
Reads are correlated between consensus and grouped BAMs using a molecule ID stored in an optional
attribute, `MI` by default. In order to support paired molecule IDs where two or more molecule IDs
are related (e.g. see the Paired assignment strategy in `group`) the molecule ID is truncated at
the last `/` if present (e.g. `1/A => 1` and `2 => 2`).
Both input BAMs must be coordinate sorted and indexed.
## Output Files
A pair of output BAMs are created:
- **<output>.consensus.bam**: Contains the relevant consensus reads from the consensus BAM
- **<output>.grouped.bam**: Contains the relevant raw reads from the grouped BAM
A review file **<output>.txt** is also created. The review file contains details on each variant
position along with detailed information on each consensus read that supports the variant. If the
`--sample` argument is supplied and the input is VCF, genotype information for that sample will be
retrieved. If the sample name isn't supplied and the VCF contains only a single sample then those
genotypes will be used.
The `--maf` parameter controls which variants get detailed per-read information in the output file.
Only variants with a minor allele frequency at or below this threshold will have detailed information
written.
"#
)]
pub struct Review {
    // Input VCF (.vcf / .vcf.gz) or interval list of variant sites.
    #[arg(short = 'i', long = "input")]
    pub input: PathBuf,
    // Coordinate-sorted, indexed consensus BAM.
    #[arg(short = 'c', long = "consensus-bam")]
    pub consensus_bam: PathBuf,
    // Coordinate-sorted, indexed grouped (raw-read) BAM.
    #[arg(short = 'g', long = "grouped-bam")]
    pub grouped_bam: PathBuf,
    // Reference FASTA; must have a .fai index and a sequence dictionary.
    #[arg(short = 'r', long = "ref")]
    pub reference: PathBuf,
    // Output path prefix for the .consensus.bam / .grouped.bam / .txt files.
    #[arg(short = 'o', long = "output")]
    pub output: PathBuf,
    // Sample whose genotypes are read from the VCF; defaults to the sole
    // sample when the VCF contains exactly one.
    #[arg(short = 's', long = "sample")]
    pub sample: Option<String>,
    // When true, 'N' calls are not treated as non-reference evidence.
    #[arg(short = 'N', long = "ignore-ns", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
    pub ignore_ns: bool,
    // Only variants with minor allele frequency <= this get detail rows.
    #[arg(short = 'm', long = "maf", default_value = "0.05")]
    pub maf: f64,
}
impl Command for Review {
fn execute(&self, command_line: &str) -> Result<()> {
info!("Review");
info!(" Input: {}", self.input.display());
info!(" Consensus BAM: {}", self.consensus_bam.display());
info!(" Grouped BAM: {}", self.grouped_bam.display());
info!(" Reference: {}", self.reference.display());
info!(" Output prefix: {}", self.output.display());
info!(" Ignore Ns: {}", self.ignore_ns);
info!(" MAF threshold: {}", self.maf);
let timer = OperationTimer::new("Reviewing variants");
validate_file_exists(&self.input, "input file")?;
validate_file_exists(&self.consensus_bam, "consensus BAM")?;
validate_file_exists(&self.grouped_bam, "grouped BAM")?;
validate_file_exists(&self.reference, "reference file")?;
self.validate_reference_files()?;
info!("Loading variants...");
let variants = if self.is_vcf_file(&self.input) {
self.load_variants_from_vcf()?
} else {
self.load_variants_from_intervals()?
};
info!("Loaded {} variant positions", variants.len());
info!("Extracting consensus reads with variants...");
let mi_set = self.extract_consensus_reads(&variants, command_line)?;
info!("Found {} unique molecular identifiers", mi_set.len());
info!("Extracting grouped reads...");
self.extract_grouped_reads(&mi_set, command_line)?;
info!("Creating BAM indexes...");
let consensus_out_path = self.output.with_extension("consensus.bam");
let grouped_out_path = self.output.with_extension("grouped.bam");
use noodles::bam;
use noodles::bam::bai;
let consensus_index = bam::fs::index(&consensus_out_path)?;
let mut consensus_index_writer = bai::io::Writer::new(std::fs::File::create(
consensus_out_path.with_extension("bam.bai"),
)?);
consensus_index_writer.write_index(&consensus_index)?;
let grouped_index = bam::fs::index(&grouped_out_path)?;
let mut grouped_index_writer = bai::io::Writer::new(std::fs::File::create(
grouped_out_path.with_extension("bam.bai"),
)?);
grouped_index_writer.write_index(&grouped_index)?;
info!("Generating detailed review file...");
self.generate_review_file(&variants)?;
info!("Done!");
timer.log_completion(variants.len() as u64);
Ok(())
}
}
impl Review {
/// Loads the BAI index for `path`, trying both naming conventions:
/// `<name>.bam.bai` first, then `<name>.bai`. Bails when neither exists.
fn read_bam_index(path: &Path) -> Result<noodles::bam::bai::Index> {
    use noodles::bam;
    let candidates = [path.with_extension("bam.bai"), path.with_extension("bai")];
    for candidate in &candidates {
        if candidate.exists() {
            return Ok(bam::bai::fs::read(candidate)?);
        }
    }
    bail!(
        "Missing BAM index for {}. Tried {} and {}",
        path.display(),
        candidates[0].display(),
        candidates[1].display()
    );
}
/// Returns true when `path` looks like a VCF (`.vcf` or `.vcf.gz`),
/// matching both extensions case-insensitively.
fn is_vcf_file(&self, path: &Path) -> bool {
    // Fix: the original lowercased only the final extension, so e.g.
    // `sample.VCF.GZ` passed the `gz` test but failed the case-sensitive
    // `.ends_with(".vcf.gz")` check. Compare against the lowercased file
    // name so both suffixes are handled consistently.
    let name = path
        .file_name()
        .map(|n| n.to_string_lossy().to_lowercase())
        .unwrap_or_default();
    name.ends_with(".vcf") || name.ends_with(".vcf.gz")
}
/// Verifies the reference FASTA has both a faidx index and a sequence
/// dictionary, bailing with an actionable `samtools` command when missing.
fn validate_reference_files(&self) -> Result<()> {
    // `with_extension` *replaces* the current extension, so for `ref.fa`
    // this yields `ref.fa.fai`; for other extensions (e.g. `ref.fasta`) it
    // produces a wrong path, but the appended `<name>.fai` form below is
    // the one that matches samtools' actual output in that case.
    let fai_path = self.reference.with_extension("fa.fai");
    let fai_path_alt = format!("{}.fai", self.reference.display());
    let has_fai = fai_path.exists() || std::path::Path::new(&fai_path_alt).exists();
    if !has_fai {
        bail!(
            "The reference file has not been indexed. Please run:\n \
            samtools faidx {}",
            self.reference.display()
        );
    }
    // Dictionary lookup (e.g. `ref.dict` / `ref.fa.dict`) is delegated to
    // find_dict_path; see crate::reference for the exact candidates tried.
    if find_dict_path(&self.reference).is_none() {
        bail!(
            "The reference file has no sequence dictionary. Tried:\n \
            - {}\n \
            - {}.dict\n\
            Please run: samtools dict {} -o {}",
            self.reference.with_extension("dict").display(),
            self.reference.display(),
            self.reference.display(),
            self.reference.with_extension("dict").display()
        );
    }
    Ok(())
}
/// Loads SNP sites from the input VCF, attaching the reference base,
/// per-sample genotype (when a sample is resolvable) and FILTER status.
/// Records whose REF allele is not a single base, and records whose MAF
/// exceeds `--maf`, are skipped.
///
/// # Errors
/// Fails on unreadable VCF/FASTA, a record without a position, or a
/// reference query error.
fn load_variants_from_vcf(&self) -> Result<Vec<Variant>> {
    use noodles::vcf;
    let mut reader = vcf::io::reader::Builder::default().build_from_path(&self.input)?;
    let header = reader.read_header()?;
    let mut variants = Vec::new();
    let mut fasta_reader = noodles::fasta::io::indexed_reader::Builder::default()
        .build_from_path(&self.reference)?;
    // Use the explicitly requested sample, or the sole sample when the VCF
    // has exactly one; otherwise genotype/MAF information is unavailable.
    let sample_name = if let Some(ref name) = self.sample {
        Some(name.clone())
    } else if header.sample_names().len() == 1 {
        Some(header.sample_names()[0].clone())
    } else {
        None
    };
    for result in reader.record_bufs(&header) {
        let record = result?;
        // Only SNPs are reviewed. Check this *before* querying the FASTA:
        // the original fetched the reference base first, doing a wasted
        // indexed query for every indel record.
        if record.reference_bases().len() != 1 {
            continue;
        }
        let chrom = record.reference_sequence_name().to_string();
        let pos = usize::from(
            record
                .variant_start()
                .ok_or_else(|| anyhow::anyhow!("Missing variant position"))?,
        ) as i32;
        let ref_base = self.get_reference_base(&mut fasta_reader, &chrom, pos)?;
        // Skip variants whose minor allele frequency exceeds the --maf cap.
        if let Some(ref sname) = sample_name {
            if let Some(maf) = Self::calculate_maf(&record, &header, sname) {
                if maf > self.maf {
                    continue;
                }
            }
        }
        let genotype = if let Some(ref sname) = sample_name {
            Self::extract_genotype_for_sample(&record, &header, sname)
        } else {
            None
        };
        let filters = Self::extract_filters(&record, &header);
        let mut variant = Variant::new(chrom, pos, ref_base);
        if let Some(gt) = genotype {
            variant = variant.with_genotype(gt);
        }
        variant = variant.with_filters(filters);
        variants.push(variant);
    }
    Ok(variants)
}
/// Loads variant sites from an interval list (1-based, inclusive ends),
/// expanding each interval into one `Variant` per covered position with its
/// reference base. Blank, `@`-header and `#`-comment lines are skipped, as
/// are rows with fewer than three tab-separated columns.
fn load_variants_from_intervals(&self) -> Result<Vec<Variant>> {
    use std::fs::File;
    use std::io::{BufRead, BufReader};
    let reader = BufReader::new(File::open(&self.input)?);
    let mut fasta_reader = noodles::fasta::io::indexed_reader::Builder::default()
        .build_from_path(&self.reference)?;
    let mut variants = Vec::new();
    for line in reader.lines() {
        let raw = line?;
        let entry = raw.trim();
        if entry.is_empty() || entry.starts_with('@') || entry.starts_with('#') {
            continue;
        }
        let columns: Vec<&str> = entry.split('\t').collect();
        if columns.len() < 3 {
            continue;
        }
        let chrom = columns[0].to_string();
        let start: i32 = columns[1].parse()?;
        let end: i32 = columns[2].parse()?;
        // Interval lists use 1-based coordinates with inclusive ends.
        for pos in start..=end {
            let ref_base = self.get_reference_base(&mut fasta_reader, &chrom, pos)?;
            variants.push(Variant::new(chrom.clone(), pos, ref_base));
        }
    }
    Ok(variants)
}
/// Fetches the (uppercased) reference base at the 1-based `pos` on `chrom`,
/// returning 'N' when the query yields an empty sequence.
fn get_reference_base(
    &self,
    reader: &mut noodles::fasta::io::IndexedReader<
        noodles::fasta::io::BufReader<std::fs::File>,
    >,
    chrom: &str,
    pos: i32,
) -> Result<char> {
    use noodles::core::Position;
    // Query a single-base region at the position.
    let position = Position::try_from(pos as usize)?;
    let region = noodles::core::Region::new(chrom, position..=position);
    let record = reader.query(&region)?;
    let sequence = record.sequence();
    match sequence.as_ref().first() {
        Some(&base) => Ok((base as char).to_ascii_uppercase()),
        None => Ok('N'),
    }
}
/// Pulls the GT value for `sample_name` from `record`, stringified via its
/// Debug representation. Returns `None` when the sample, the GT key, or a
/// decodable value is absent.
fn extract_genotype_for_sample(
    record: &noodles::vcf::variant::RecordBuf,
    header: &noodles::vcf::Header,
    sample_name: &str,
) -> Option<String> {
    use noodles::vcf::variant::record::samples::Sample;
    let sample_idx = header.sample_names().iter().position(|s| s == sample_name)?;
    let samples = record.samples();
    let keys = samples.keys();
    let gt_idx = keys.as_ref().iter().position(|k| k == "GT")?;
    let sample_values = samples.get_index(sample_idx)?;
    match sample_values.get_index(header, gt_idx) {
        Some(Ok(Some(gt_value))) => Some(format!("{gt_value:?}")),
        _ => None,
    }
}
/// Renders the record's FILTER column as a sorted, comma-joined string,
/// or "PASS" when the record passes (or no filter name decodes).
fn extract_filters(
    record: &noodles::vcf::variant::RecordBuf,
    header: &noodles::vcf::Header,
) -> String {
    use noodles::vcf::variant::record::Filters;
    let filters = record.filters();
    if filters.is_pass() {
        return "PASS".to_string();
    }
    // Collect decodable filter names, silently dropping any that error.
    let mut names: Vec<String> = filters
        .iter(header)
        .filter_map(std::result::Result::ok)
        .map(std::string::ToString::to_string)
        .collect();
    if names.is_empty() {
        return "PASS".to_string();
    }
    names.sort();
    names.join(",")
}
/// Estimates the minor allele frequency for `sample_name`: prefers the AF
/// FORMAT field; falls back to `1 - ref_depth/total_depth` computed from
/// AD. Returns `None` when neither field is present or parseable.
fn calculate_maf(
    record: &noodles::vcf::variant::RecordBuf,
    header: &noodles::vcf::Header,
    sample_name: &str,
) -> Option<f64> {
    use noodles::vcf::variant::record::samples::Sample;
    let sample_idx = header.sample_names().iter().position(|s| s == sample_name)?;
    let samples = record.samples();
    let keys = samples.keys();
    // First choice: the AF (allele frequency) FORMAT field.
    if let Some(af_idx) = keys.as_ref().iter().position(|k| k == "AF") {
        if let Some(sample_values) = samples.get_index(sample_idx) {
            if let Some(Ok(Some(value))) = sample_values.get_index(header, af_idx) {
                // NOTE(review): the value is stringified via Debug and then
                // scraped — fragile, as it depends on noodles' Debug output
                // shape; matching on the typed Value variants would be
                // sturdier. Left as-is to preserve behavior.
                let value_str = format!("{value:?}");
                let cleaned = value_str.trim_matches(|c| c == '[' || c == ']' || c == '"');
                // For multi-valued AF only the first entry is used.
                if let Some(first_val) = cleaned.split(',').next() {
                    if let Ok(af) = first_val.trim().parse::<f64>() {
                        return Some(af);
                    }
                }
            }
        }
    }
    // Fallback: derive the non-reference fraction from AD allele depths.
    if let Some(ad_idx) = keys.as_ref().iter().position(|k| k == "AD") {
        if let Some(sample_values) = samples.get_index(sample_idx) {
            if let Some(Ok(Some(value))) = sample_values.get_index(header, ad_idx) {
                let value_str = format!("{value:?}");
                let cleaned = value_str.trim_matches(|c| c == '[' || c == ']' || c == '"');
                let depths: Vec<i32> =
                    cleaned.split(',').filter_map(|s| s.trim().parse::<i32>().ok()).collect();
                if !depths.is_empty() {
                    let total: i32 = depths.iter().sum();
                    if total > 0 {
                        // AD convention: first depth is the REF allele.
                        let ref_depth = depths[0];
                        return Some(1.0 - (ref_depth as f64 / total as f64));
                    }
                }
            }
        }
    }
    None
}
/// Queries the consensus BAM at each variant site and writes every consensus
/// read carrying a non-reference call there to `<output>.consensus.bam`.
/// Returns the set of molecule ids (MI truncated at the last '/') of those
/// reads, used later to pull matching raw reads from the grouped BAM.
fn extract_consensus_reads(
    &self,
    variants: &[Variant],
    command_line: &str,
) -> Result<HashSet<String>> {
    use noodles::bam;
    let index = Self::read_bam_index(&self.consensus_bam)?;
    let mut reader = IndexedRawBamReader::from_path(&self.consensus_bam, index)?;
    let header = reader.read_header()?;
    // Record this invocation in the output header's @PG chain.
    let header = crate::commands::common::add_pg_record(header, command_line)?;
    let consensus_out_path = self.output.with_extension("consensus.bam");
    let mut writer = bam::io::writer::Builder.build_from_path(&consensus_out_path)?;
    writer.write_header(&header)?;
    let mut mi_set = HashSet::new();
    for variant in variants {
        // Single-base region query at the variant position.
        let start = noodles::core::Position::try_from(variant.pos as usize)?;
        let region = noodles::core::Region::new(variant.chrom.as_str(), start..=start);
        for result in reader.query(&header, &region)? {
            let record = result?;
            if self.has_non_reference_base(&record, variant)? {
                // Reads lacking an MI tag are skipped entirely (not written).
                if let Some(mi_bytes) = find_string_tag_in_record(&record, &SamTag::MI) {
                    let mi = std::str::from_utf8(mi_bytes)?;
                    let mi_base = extract_mi_base(mi).to_string();
                    mi_set.insert(mi_base);
                    write_raw_record(writer.get_mut(), &record)?;
                }
            }
        }
    }
    Ok(mi_set)
}
/// Streams the grouped BAM and copies to `<output>.grouped.bam` every record
/// whose molecule id (MI truncated at the last '/') appears in `mi_set`.
fn extract_grouped_reads(&self, mi_set: &HashSet<String>, command_line: &str) -> Result<()> {
    use noodles::bam;
    let mut bam_reader = bam::io::reader::Builder.build_from_path(&self.grouped_bam)?;
    let header = bam_reader.read_header()?;
    let header = crate::commands::common::add_pg_record(header, command_line)?;
    let grouped_out_path = self.output.with_extension("grouped.bam");
    let mut writer = bam::io::writer::Builder.build_from_path(&grouped_out_path)?;
    writer.write_header(&header)?;
    // Switch to the raw reader for a fast, allocation-light scan.
    let mut raw_reader = RawBamReader::new(bam_reader.into_inner());
    let mut record = RawRecord::new();
    let progress = ProgressTracker::new("Processed grouped reads").with_interval(1_000_000);
    while raw_reader.read_record(&mut record)? != 0 {
        progress.log_if_needed(1);
        // Records without an MI tag can never match; skip them.
        let Some(mi_bytes) = find_string_tag_in_record(&record, &SamTag::MI) else {
            continue;
        };
        if mi_set.contains(extract_mi_base(std::str::from_utf8(mi_bytes)?)) {
            write_raw_record(writer.get_mut(), &record)?;
        }
    }
    progress.log_final();
    Ok(())
}
/// Writes the tab-separated review file (`<output>.txt`): for every variant
/// site, one row per consensus read carrying a non-reference call, with
/// aggregate consensus base counts and raw-read base counts at that site.
fn generate_review_file(&self, variants: &[Variant]) -> Result<()> {
    let review_path = self.output.with_extension("txt");
    let con_index = Self::read_bam_index(&self.consensus_bam)?;
    let mut consensus_reader = IndexedRawBamReader::from_path(&self.consensus_bam, con_index)?;
    let consensus_header = consensus_reader.read_header()?;
    let grp_index = Self::read_bam_index(&self.grouped_bam)?;
    let mut grouped_reader = IndexedRawBamReader::from_path(&self.grouped_bam, grp_index)?;
    let grouped_header = grouped_reader.read_header()?;
    let mut all_metrics = Vec::new();
    for variant in variants {
        let start = noodles::core::Position::try_from(variant.pos as usize)?;
        let region = noodles::core::Region::new(variant.chrom.as_str(), start..=start);
        // Materialize the consensus reads at this site so we can make two
        // passes: aggregate counts first, then a per-read detail pass.
        let mut consensus_reads: Vec<RawRecord> = Vec::new();
        for result in consensus_reader.query(&consensus_header, &region)? {
            consensus_reads.push(result?);
        }
        // Pass 1: aggregate base counts across all consensus reads.
        let mut consensus_counts = BaseCounts::default();
        for record in &consensus_reads {
            if let Some(base) = self.get_base_at_position(record, variant.pos)? {
                consensus_counts
                    .add_base(Self::normalize_base_for_variant(base, variant.ref_base));
            }
        }
        // Pass 2: one detail row per non-reference consensus read.
        for record in consensus_reads {
            let read_base = match self.get_base_at_position(&record, variant.pos)? {
                Some(b) => Self::normalize_base_for_variant(b, variant.ref_base) as char,
                None => {
                    // No aligned base: a spanning deletion is reported as
                    // '*'; otherwise the read doesn't cover the site.
                    if self.is_spanning_deletion(&record, variant.pos)? {
                        '*'
                    } else {
                        continue;
                    }
                }
            };
            if read_base == variant.ref_base {
                continue;
            }
            if self.ignore_ns && read_base == 'N' {
                continue;
            }
            let read_qual = self.get_quality_at_position(&record, variant.pos)?.unwrap_or(0);
            // Reads lacking an MI tag cannot be matched to raw reads; skip.
            let mi_str = match find_string_tag_in_record(&record, &SamTag::MI) {
                Some(mi_bytes) => std::str::from_utf8(mi_bytes)?.to_string(),
                None => continue,
            };
            let mi_base = extract_mi_base(&mi_str);
            let read_name = String::from_utf8_lossy(record.read_name()).to_string();
            let is_first = record.is_first_segment();
            let consensus_read_name = format!("{}{}", read_name, read_number_suffix(is_first));
            let insert_string = format_insert_string(&record, &consensus_header);
            // Count bases among raw (grouped) reads sharing this read's
            // molecule id and read number.
            // NOTE(review): this re-queries the grouped BAM once per
            // non-reference consensus read at the site — quadratic in reads
            // per site. Acceptable for review-sized output, but worth a
            // single query + in-memory grouping if it ever becomes hot.
            let raw_counts = {
                use std::collections::HashSet;
                let start = noodles::core::Position::try_from(variant.pos as usize)?;
                let region = noodles::core::Region::new(variant.chrom.as_str(), start..=start);
                let mut counts = BaseCounts::default();
                let mut seen_reads = HashSet::new();
                let expected_read_num =
                    if consensus_read_name.ends_with("/1") { "/1" } else { "/2" };
                for result in grouped_reader.query(&grouped_header, &region)? {
                    let rec = result?;
                    let grp_mi_str = match find_string_tag_in_record(&rec, &SamTag::MI) {
                        Some(mi_bytes) => std::str::from_utf8(mi_bytes)?.to_string(),
                        None => continue,
                    };
                    let read_mi_base = extract_mi_base(&grp_mi_str);
                    if read_mi_base != mi_base {
                        continue;
                    }
                    let is_first = rec.is_first_segment();
                    let read_num = read_number_suffix(is_first);
                    if read_num != expected_read_num {
                        continue;
                    }
                    // Deduplicate raw reads by name so secondary or
                    // supplementary alignments aren't counted twice.
                    let rn = String::from_utf8_lossy(rec.read_name()).to_string();
                    if !seen_reads.insert(rn) {
                        continue;
                    }
                    if let Some(base) = self.get_base_at_position(&rec, variant.pos)? {
                        counts
                            .add_base(Self::normalize_base_for_variant(base, variant.ref_base));
                    }
                }
                counts
            };
            let info = ConsensusVariantReviewInfo {
                chrom: variant.chrom.clone(),
                pos: variant.pos,
                ref_allele: variant.ref_base.to_string(),
                genotype: variant.genotype.clone().unwrap_or_else(|| "NA".to_string()),
                filters: variant.filters.clone().unwrap_or_else(|| "NA".to_string()),
                consensus_a: consensus_counts.a,
                consensus_c: consensus_counts.c,
                consensus_g: consensus_counts.g,
                consensus_t: consensus_counts.t,
                consensus_n: consensus_counts.n,
                consensus_read: consensus_read_name,
                consensus_insert: insert_string,
                consensus_call: read_base,
                consensus_qual: read_qual,
                a: raw_counts.a,
                c: raw_counts.c,
                g: raw_counts.g,
                t: raw_counts.t,
                n: raw_counts.n,
            };
            all_metrics.push(info);
        }
    }
    // Serialize as TSV via serde; headers come from the struct fields.
    let mut writer = csv::WriterBuilder::new().delimiter(b'\t').from_path(&review_path)?;
    for metric in all_metrics {
        writer.serialize(&metric)?;
    }
    writer.flush()?;
    info!("Wrote review metrics to {}", review_path.display());
    Ok(())
}
/// Returns the ASCII base the read carries at the 1-based `ref_pos`, or
/// `None` when the position maps to no read base (or past the sequence).
fn get_base_at_position(&self, record: &RawRecord, ref_pos: i32) -> Result<Option<u8>> {
    let Some(offset) = self.calculate_read_offset(record, ref_pos)? else {
        return Ok(None);
    };
    if offset >= record.l_seq() as usize {
        return Ok(None);
    }
    Ok(Some(BAM_BASE_TO_ASCII[record.get_base(offset) as usize]))
}
/// Uppercases `base`, expanding the BAM '=' (matches-reference) code to the
/// uppercased reference allele.
fn normalize_base_for_variant(base: u8, ref_base: char) -> u8 {
    match base {
        b'=' => ref_base.to_ascii_uppercase() as u8,
        other => other.to_ascii_uppercase(),
    }
}
/// Returns the base quality the read carries at the 1-based `ref_pos`, or
/// `None` when the position maps to no read base (or past the qualities).
fn get_quality_at_position(&self, record: &RawRecord, ref_pos: i32) -> Result<Option<u8>> {
    let Some(offset) = self.calculate_read_offset(record, ref_pos)? else {
        return Ok(None);
    };
    // `.get` covers the offset-past-end case in one step.
    Ok(record.quality_scores().get(offset).copied())
}
/// Returns true when `record` covers `variant.pos` with a base differing
/// from the reference allele (after '=' normalization), or spans it with a
/// deletion. Honors `--ignore-ns`: an 'N' call is not counted when set.
fn has_non_reference_base(&self, record: &RawRecord, variant: &Variant) -> Result<bool> {
    if record.is_unmapped() {
        return Ok(false);
    }
    let start = match record.alignment_start_1based() {
        Some(pos) => pos as i32,
        None => return Ok(false),
    };
    let end = match record.alignment_end_1based() {
        Some(pos) => pos as i32,
        None => return Ok(false),
    };
    // The variant must fall inside the aligned span of the read.
    if variant.pos < start || variant.pos > end {
        return Ok(false);
    }
    let read_offset = self.calculate_read_offset(record, variant.pos)?;
    if let Some(offset) = read_offset {
        let l_seq = record.l_seq() as usize;
        if offset < l_seq {
            let code = record.get_base(offset) as usize;
            // '=' codes normalize to the reference base, so they can never
            // compare as non-reference here.
            let base_char =
                Self::normalize_base_for_variant(BAM_BASE_TO_ASCII[code], variant.ref_base)
                as char;
            if base_char != variant.ref_base {
                if self.ignore_ns && base_char == 'N' {
                    return Ok(false);
                }
                return Ok(true);
            }
        }
    } else {
        // No aligned base at the site: a spanning deletion still counts as
        // a non-reference observation.
        if self.is_spanning_deletion(record, variant.pos)? {
            return Ok(true);
        }
    }
    Ok(false)
}
/// Reports whether `record` spans the 1-based `ref_pos` with a CIGAR
/// deletion. Returns false for unaligned records or positions outside the
/// aligned span.
fn is_spanning_deletion(&self, record: &RawRecord, ref_pos: i32) -> Result<bool> {
    let (start, end) = match (record.alignment_start_1based(), record.alignment_end_1based()) {
        (Some(s), Some(e)) => (s as i32, e as i32),
        _ => return Ok(false),
    };
    if !(start..=end).contains(&ref_pos) {
        return Ok(false);
    }
    // Walk the CIGAR with a reference cursor; succeed when the position
    // falls inside a deletion op.
    let mut ref_cursor = start;
    for op in record.cigar_ops_typed() {
        let span = op.len() as i32;
        match op.kind() {
            CigarKind::Deletion => {
                if (ref_cursor..ref_cursor + span).contains(&ref_pos) {
                    return Ok(true);
                }
                ref_cursor += span;
            }
            // These consume reference without being deletions.
            CigarKind::Match
            | CigarKind::SequenceMatch
            | CigarKind::SequenceMismatch
            | CigarKind::Skip => {
                ref_cursor += span;
            }
            // These consume no reference bases.
            CigarKind::Insertion | CigarKind::SoftClip | CigarKind::HardClip | CigarKind::Pad => {}
        }
    }
    Ok(false)
}
/// Maps a 1-based reference position to a 0-based offset into the read's
/// sequence by walking the CIGAR. Returns `Ok(None)` when the position has
/// no aligned base: unmapped record, position before the alignment start,
/// inside a deletion/skip, or past the alignment end.
fn calculate_read_offset(&self, record: &RawRecord, ref_pos: i32) -> Result<Option<usize>> {
    let start = match record.alignment_start_1based() {
        Some(pos) => pos as i32,
        None => return Ok(None),
    };
    // Fix: guard positions before the alignment start. The original
    // computed `(ref_pos - current_ref_pos) as usize` on the first M op,
    // which wraps for a negative difference into a huge bogus offset —
    // callers only survived because they bounds-check the result.
    if ref_pos < start {
        return Ok(None);
    }
    let mut current_ref_pos = start;
    let mut current_read_pos: usize = 0;
    for op in record.cigar_ops_typed() {
        let len = op.len() as usize;
        match op.kind() {
            // Consume both reference and read.
            CigarKind::Match | CigarKind::SequenceMatch | CigarKind::SequenceMismatch => {
                if current_ref_pos + len as i32 > ref_pos {
                    let offset_in_op = (ref_pos - current_ref_pos) as usize;
                    return Ok(Some(current_read_pos + offset_in_op));
                }
                current_ref_pos += len as i32;
                current_read_pos += len;
            }
            // Consume read only.
            CigarKind::Insertion | CigarKind::SoftClip => {
                current_read_pos += len;
            }
            // Consume reference only: a position inside the gap has no
            // corresponding read base.
            CigarKind::Deletion | CigarKind::Skip => {
                if current_ref_pos + len as i32 > ref_pos {
                    return Ok(None);
                }
                current_ref_pos += len as i32;
            }
            // Consume neither.
            CigarKind::HardClip | CigarKind::Pad => {}
        }
    }
    Ok(None)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
use tempfile::TempDir;
mod test_utils {
use super::*;
use fgumi_raw_bam::{
SamBuilder as RawSamBuilder, flags, raw_record_to_record_buf, testutil::encode_op,
};
use noodles::sam::Header;
use noodles::sam::alignment::RecordBuf;
use noodles::sam::header::record::value::{Map, map::ReferenceSequence};
use std::num::NonZeroUsize;
/// Writes a tiny two-contig reference (chr1 = 100 A's, chr2 = 100 C's)
/// into `dir`, together with a hand-written .fai index and .dict sequence
/// dictionary; returns the FASTA path.
pub fn create_test_reference(dir: &TempDir) -> PathBuf {
    let ref_path = dir.path().join("ref.fa");
    let fai_path = dir.path().join("ref.fa.fai");
    let dict_path = dir.path().join("ref.dict");
    let fasta_content = b">chr1\n\
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n\
>chr2\n\
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\n";
    std::fs::write(&ref_path, fasta_content).expect("failed to write file");
    // faidx columns: name, length, byte offset, bases per line, bytes per line.
    let fai_content = b"chr1\t100\t6\t100\t101\nchr2\t100\t114\t100\t101\n";
    std::fs::write(&fai_path, fai_content).expect("failed to write file");
    let dict_content = b"@HD\tVN:1.5\tSO:unsorted\n\
@SQ\tSN:chr1\tLN:100\n\
@SQ\tSN:chr2\tLN:100\n";
    std::fs::write(&dict_path, dict_content).expect("failed to write file");
    ref_path
}
/// Builds a coordinate-sorted SAM header declaring chr1 and chr2 (100 bp
/// each).
///
/// Fix: the original built the full header twice — once without @HD and
/// then again from scratch inside the `if let`, discarding the first
/// build — duplicating the reference-sequence setup. This version builds
/// it exactly once.
pub fn create_test_header() -> Header {
    use bstr::BString;
    // Helper producing a fresh 100 bp reference-sequence map.
    let ref_seq = || {
        Map::<ReferenceSequence>::new(
            NonZeroUsize::try_from(100).expect("non-zero value required"),
        )
    };
    // Parse just the @HD line to obtain a typed header record.
    let header_str = "@HD\tVN:1.6\tSO:coordinate\n";
    let parsed_header: Header = header_str.parse().expect("valid parse input");
    let mut builder = Header::builder();
    if let Some(hd) = parsed_header.header() {
        builder = builder.set_header(hd.clone());
    }
    builder
        .add_reference_sequence(BString::from("chr1"), ref_seq())
        .add_reference_sequence(BString::from("chr2"), ref_seq())
        .build()
}
/// Converts a raw BAM record into a noodles `RecordBuf` using an empty
/// header; panics on failure since this is test-only scaffolding.
fn to_record_buf(raw: fgumi_raw_bam::RawRecord) -> RecordBuf {
    raw_record_to_record_buf(&raw, &Header::default())
        .expect("raw_record_to_record_buf failed in test")
}
/// Translates a text CIGAR such as "4M4D6M" into packed BAM op words.
/// Panics on malformed input (missing length or unknown op letter) — fine
/// for test fixtures.
fn parse_cigar_to_ops(cigar_str: &str) -> Vec<u32> {
    let mut ops = Vec::new();
    let mut digits = String::new();
    for ch in cigar_str.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
            continue;
        }
        // A non-digit terminates the pending length and names the op.
        let len: usize = digits.parse().expect("valid cigar length");
        digits.clear();
        let op_code: u32 = match ch {
            'M' => 0,
            'I' => 1,
            'D' => 2,
            'N' => 3,
            'S' => 4,
            'H' => 5,
            'P' => 6,
            '=' => 7,
            'X' => 8,
            other => panic!("unknown CIGAR op: {other}"),
        };
        ops.push(encode_op(op_code, len));
    }
    ops
}
#[allow(clippy::too_many_arguments, clippy::cast_possible_truncation)]
/// Builds a mapped, MI-tagged read pair: R1 forward at `start1` (with an
/// optional custom CIGAR), R2 reverse at `start2`, both MAPQ 60 with flat
/// Q45 qualities. Positions are 1-based (converted to BAM 0-based here).
pub fn create_read_pair(
    name: &str,
    chrom_idx: usize,
    start1: i32,
    start2: i32,
    bases1: &[u8],
    bases2: &[u8],
    mi: &str,
    cigar1: Option<&str>,
) -> (RecordBuf, RecordBuf) {
    // Default R1 CIGAR is a full-length match; R2 is always full-length.
    let cigar1_str = cigar1.map_or_else(|| format!("{}M", bases1.len()), String::from);
    let cigar1_ops = parse_cigar_to_ops(&cigar1_str);
    let cigar2_ops = [encode_op(0, bases2.len())];
    let mut b1 = RawSamBuilder::new();
    b1.read_name(name.as_bytes())
        .flags(flags::PAIRED | flags::FIRST_SEGMENT)
        .ref_id(chrom_idx as i32)
        .pos(start1 - 1)
        .mapq(60)
        .cigar_ops(&cigar1_ops)
        .sequence(bases1)
        .qualities(&vec![45u8; bases1.len()])
        .mate_ref_id(chrom_idx as i32)
        .mate_pos(start2 - 1);
    b1.add_string_tag(b"MI", mi.as_bytes());
    let r1 = to_record_buf(b1.build());
    let mut b2 = RawSamBuilder::new();
    b2.read_name(name.as_bytes())
        .flags(flags::PAIRED | flags::LAST_SEGMENT | flags::REVERSE)
        .ref_id(chrom_idx as i32)
        .pos(start2 - 1)
        .mapq(60)
        .cigar_ops(&cigar2_ops)
        .sequence(bases2)
        .qualities(&vec![45u8; bases2.len()])
        .mate_ref_id(chrom_idx as i32)
        .mate_pos(start1 - 1);
    b2.add_string_tag(b"MI", mi.as_bytes());
    let r2 = to_record_buf(b2.build());
    (r1, r2)
}
/// Builds matched raw (grouped) and consensus BAMs in `dir`, writes BAI
/// indexes for both, and returns `(raw_path, consensus_path)`.
///
/// Fixture layout (one molecule id per letter; raw reads are named with a
/// numeric suffix, consensus reads without):
/// - MI "A": two raw pairs plus an N-carrying consensus pair near chr1:10
/// - MI "B"/"C": one raw + one consensus pair each near chr1:20
/// - MI "D": 4M4D6M reads starting at chr1:25 (deletion spans 29-32)
/// - MI "E": N call; MI "F": G call; MI "H": chr2 reads
pub fn create_test_bams(dir: &TempDir) -> (PathBuf, PathBuf) {
    use noodles::bam;
    use noodles::sam::alignment::io::Write;
    let raw_path = dir.path().join("raw.bam");
    let consensus_path = dir.path().join("consensus.bam");
    let header = create_test_header();
    let mut raw_writer = bam::io::Writer::new(
        std::fs::File::create(&raw_path).expect("failed to create file"),
    );
    raw_writer.write_header(&header).expect("failed to write BAM header");
    let mut con_writer = bam::io::Writer::new(
        std::fs::File::create(&consensus_path).expect("failed to create file"),
    );
    con_writer.write_header(&header).expect("failed to write BAM header");
    // Molecule A: raw pairs A1/A2 and consensus pair A.
    let (r1, r2) =
        create_read_pair("A1", 0, 6, 50, b"AAAATAAAAA", b"AAAAAAAAAA", "A", None);
    raw_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    raw_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    let (r1, r2) =
        create_read_pair("A2", 0, 6, 50, b"AAAATAAAAG", b"AAAAAAAAAA", "A", None);
    raw_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    raw_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    let (r1, r2) = create_read_pair("A", 0, 6, 50, b"AAAATAAAAN", b"AAAAAAAAAA", "A", None);
    con_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    con_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    // Molecule B: carries a C call.
    let (r1, r2) =
        create_read_pair("B1", 0, 16, 50, b"AAAACAAAAA", b"AAAAAAAAAA", "B", None);
    raw_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    raw_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    let (r1, r2) =
        create_read_pair("B", 0, 16, 50, b"AAAACAAAAA", b"AAAAAAAAAA", "B", None);
    con_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    con_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    // Molecule C: all-reference reads (should never be selected).
    let (r1, r2) =
        create_read_pair("C1", 0, 17, 50, b"AAAAAAAAAA", b"AAAAAAAAAA", "C", None);
    raw_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    raw_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    let (r1, r2) =
        create_read_pair("C", 0, 17, 50, b"AAAAAAAAAA", b"AAAAAAAAAA", "C", None);
    con_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    con_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    // Molecule D: spanning-deletion case via a 4M4D6M CIGAR.
    let (r1, r2) = create_read_pair(
        "D1",
        0,
        25,
        60,
        b"AAAAAAAAAA",
        b"AAAAAAAAAA",
        "D",
        Some("4M4D6M"),
    );
    raw_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    raw_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    let (r1, r2) =
        create_read_pair("D", 0, 25, 60, b"AAAAAAAAAA", b"AAAAAAAAAA", "D", Some("4M4D6M"));
    con_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    con_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    // Molecule E: carries an N call (exercises --ignore-ns).
    let (r1, r2) =
        create_read_pair("E1", 0, 26, 60, b"AAAANAAAAA", b"AAAAAAAAAA", "E", None);
    raw_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    raw_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    let (r1, r2) =
        create_read_pair("E", 0, 26, 60, b"AAAANAAAAA", b"AAAAAAAAAA", "E", None);
    con_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    con_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    // Molecule F: carries a G call.
    let (r1, r2) =
        create_read_pair("F1", 0, 27, 60, b"AAAGAAAAAA", b"AAAAAAAAAA", "F", None);
    raw_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    raw_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    let (r1, r2) =
        create_read_pair("F", 0, 27, 60, b"AAAGAAAAAA", b"AAAAAAAAAA", "F", None);
    con_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    con_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    // Molecule H: reads on the second contig (chr2).
    let (r1, r2) =
        create_read_pair("H1", 1, 15, 19, b"CCCCCTCCCC", b"CTCCCCCCCC", "H", None);
    raw_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    raw_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    let (r1, r2) =
        create_read_pair("H", 1, 15, 19, b"CCCCCTCCCC", b"CTCCCCCCCC", "H", None);
    con_writer.write_alignment_record(&header, &r1).expect("failed to write BAM record");
    con_writer.write_alignment_record(&header, &r2).expect("failed to write BAM record");
    // Close writers before indexing so all records are flushed to disk.
    drop(raw_writer);
    drop(con_writer);
    use noodles::bam::bai;
    let raw_index_path = raw_path.with_extension("bam.bai");
    let raw_index = bam::fs::index(&raw_path).expect("failed to index BAM file");
    let mut raw_index_writer = bai::io::Writer::new(
        std::fs::File::create(&raw_index_path).expect("failed to create file"),
    );
    raw_index_writer.write_index(&raw_index).expect("failed to write BAM index");
    let consensus_index_path = consensus_path.with_extension("bam.bai");
    let con_index = bam::fs::index(&consensus_path).expect("failed to index BAM file");
    let mut con_index_writer = bai::io::Writer::new(
        std::fs::File::create(&consensus_index_path).expect("failed to create file"),
    );
    con_index_writer.write_index(&con_index).expect("failed to write BAM index");
    (raw_path, consensus_path)
}
/// Writes a four-record VCF (three chr1 SNPs with low AF, one chr2 SNP
/// with AD depths) for sample "tumor"; returns its path.
pub fn create_test_vcf(dir: &TempDir) -> PathBuf {
    let vcf_path = dir.path().join("variants.vcf");
    let vcf_content = b"##fileformat=VCFv4.2\n\
##contig=<ID=chr1,length=100>\n\
##contig=<ID=chr2,length=100>\n\
##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n\
##FORMAT=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n\
##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allele Depth\">\n\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttumor\n\
chr1\t10\t.\tA\tT\t.\tPASS\t.\tGT:AF\t0/1:0.01\n\
chr1\t20\t.\tA\tC\t.\tPASS\t.\tGT:AF\t0/1:0.01\n\
chr1\t30\t.\tA\tG\t.\tPASS\t.\tGT:AF\t0/1:0.01\n\
chr2\t20\t.\tC\tT\t.\tPASS\t.\tGT:AD\t0/1:100,2\n";
    std::fs::write(&vcf_path, vcf_content).expect("failed to write file");
    vcf_path
}
/// Writes a header-only VCF (no variant records) and returns its path.
pub fn create_empty_vcf(dir: &TempDir) -> PathBuf {
    let vcf_path = dir.path().join("empty.vcf");
    let vcf_content = b"##fileformat=VCFv4.2\n\
##contig=<ID=chr1,length=100>\n\
##contig=<ID=chr2,length=100>\n\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttumor\n";
    std::fs::write(&vcf_path, vcf_content).expect("failed to write file");
    vcf_path
}
/// Writes an interval list that declares chr1/chr2 in its SAM-style
/// header but contains no intervals, returning its path.
pub fn create_empty_interval_list(dir: &TempDir) -> PathBuf {
    let interval_path = dir.path().join("empty.interval_list");
    let interval_content = concat!(
        "@HD\tVN:1.5\tSO:unsorted\n",
        "@SQ\tSN:chr1\tLN:100\n",
        "@SQ\tSN:chr2\tLN:100\n",
    );
    std::fs::write(&interval_path, interval_content).expect("failed to write file");
    interval_path
}
}
#[test]
fn test_default_review_parameters() {
    // A Review built with the documented defaults should report them back:
    // no sample filter, N-calls kept, and a 5% minor-allele-frequency cap.
    let cmd = Review {
        input: PathBuf::from("variants.vcf"),
        consensus_bam: PathBuf::from("consensus.bam"),
        grouped_bam: PathBuf::from("grouped.bam"),
        reference: PathBuf::from("ref.fa"),
        output: PathBuf::from("output"),
        sample: None,
        ignore_ns: false,
        maf: 0.05,
    };
    assert!(cmd.sample.is_none());
    assert!(!cmd.ignore_ns);
    assert!((cmd.maf - 0.05).abs() < f64::EPSILON);
}
#[test]
fn test_review_with_sample_name() {
    // An explicitly supplied sample name must be stored verbatim.
    let cfg = Review {
        input: PathBuf::from("variants.vcf"),
        consensus_bam: PathBuf::from("consensus.bam"),
        grouped_bam: PathBuf::from("grouped.bam"),
        reference: PathBuf::from("ref.fa"),
        output: PathBuf::from("output"),
        sample: Some("SAMPLE1".to_string()),
        ignore_ns: false,
        maf: 0.05,
    };
    assert_eq!(cfg.sample.as_deref(), Some("SAMPLE1"));
}
#[test]
fn test_review_ignore_ns_enabled() {
    // Setting ignore_ns = true must be reflected by the struct.
    let args = Review {
        input: PathBuf::from("variants.vcf"),
        consensus_bam: PathBuf::from("consensus.bam"),
        grouped_bam: PathBuf::from("grouped.bam"),
        reference: PathBuf::from("ref.fa"),
        output: PathBuf::from("output"),
        sample: None,
        ignore_ns: true,
        maf: 0.05,
    };
    assert!(args.ignore_ns);
}
#[test]
fn test_review_custom_maf_threshold() {
    // A non-default MAF threshold (1%) must round-trip through the struct.
    let args = Review {
        input: PathBuf::from("variants.vcf"),
        consensus_bam: PathBuf::from("consensus.bam"),
        grouped_bam: PathBuf::from("grouped.bam"),
        reference: PathBuf::from("ref.fa"),
        output: PathBuf::from("output"),
        sample: None,
        ignore_ns: false,
        maf: 0.01,
    };
    assert!((args.maf - 0.01).abs() < f64::EPSILON);
}
#[test]
fn test_is_vcf_file_detection() {
    let cmd = Review {
        input: PathBuf::from("test.vcf"),
        consensus_bam: PathBuf::from("consensus.bam"),
        grouped_bam: PathBuf::from("grouped.bam"),
        reference: PathBuf::from("ref.fa"),
        output: PathBuf::from("output"),
        sample: None,
        ignore_ns: false,
        maf: 0.05,
    };
    // Plain and bgzipped VCF names are recognized; other extensions are not.
    for (name, expected) in [
        ("variants.vcf", true),
        ("variants.vcf.gz", true),
        ("intervals.bed", false),
        ("file.txt", false),
    ] {
        assert_eq!(cmd.is_vcf_file(&PathBuf::from(name)), expected, "case: {}", name);
    }
}
#[test]
fn test_review_output_path_extensions() {
    let cmd = Review {
        input: PathBuf::from("variants.vcf"),
        consensus_bam: PathBuf::from("consensus.bam"),
        grouped_bam: PathBuf::from("grouped.bam"),
        reference: PathBuf::from("ref.fa"),
        output: PathBuf::from("/path/to/output"),
        sample: None,
        ignore_ns: false,
        maf: 0.05,
    };
    // The three output files are derived from the base path by appending
    // a suffix via with_extension.
    assert_eq!(
        cmd.output.with_extension("consensus.bam"),
        PathBuf::from("/path/to/output.consensus.bam")
    );
    assert_eq!(
        cmd.output.with_extension("grouped.bam"),
        PathBuf::from("/path/to/output.grouped.bam")
    );
    assert_eq!(cmd.output.with_extension("txt"), PathBuf::from("/path/to/output.txt"));
}
#[test]
fn test_empty_vcf_produces_empty_outputs() {
    use noodles::bam;
    // Running review with a header-only VCF should still create all three
    // output files, and both output BAMs should contain zero records.
    let tmp = TempDir::new().expect("failed to create temp dir");
    let reference = test_utils::create_test_reference(&tmp);
    let (grouped, consensus) = test_utils::create_test_bams(&tmp);
    let empty_vcf = test_utils::create_empty_vcf(&tmp);
    let out_base = tmp.path().join("output");
    let review = Review {
        input: empty_vcf,
        consensus_bam: consensus,
        grouped_bam: grouped,
        reference,
        output: out_base.clone(),
        sample: None,
        ignore_ns: false,
        maf: 0.05,
    };
    review.execute("test").expect("execute should succeed");
    for ext in ["consensus.bam", "grouped.bam", "txt"] {
        assert!(out_base.with_extension(ext).exists(), "missing output: {}", ext);
    }
    for ext in ["consensus.bam", "grouped.bam"] {
        let mut reader = bam::io::indexed_reader::Builder::default()
            .build_from_path(out_base.with_extension(ext))
            .expect("failed to open indexed BAM");
        let header = reader.read_header().expect("failed to read BAM header");
        let mut record = noodles::sam::alignment::RecordBuf::default();
        let bytes_read = reader
            .read_record_buf(&header, &mut record)
            .expect("failed to read BAM record");
        // A return of 0 means EOF on the first read, i.e. an empty BAM.
        assert_eq!(bytes_read, 0, "{} should contain no records", ext);
    }
}
#[test]
fn test_empty_interval_list_produces_empty_outputs() {
    use noodles::bam;
    // An interval list with a header but no intervals yields no variant
    // sites, so the consensus output BAM must exist but be empty.
    let tmp = TempDir::new().expect("failed to create temp dir");
    let reference = test_utils::create_test_reference(&tmp);
    let (grouped, consensus) = test_utils::create_test_bams(&tmp);
    let intervals = test_utils::create_empty_interval_list(&tmp);
    let out_base = tmp.path().join("output");
    let review = Review {
        input: intervals,
        consensus_bam: consensus,
        grouped_bam: grouped,
        reference,
        output: out_base.clone(),
        sample: None,
        ignore_ns: false,
        maf: 0.05,
    };
    review.execute("test").expect("execute should succeed");
    let con_out = out_base.with_extension("consensus.bam");
    let raw_out = out_base.with_extension("grouped.bam");
    assert!(con_out.exists());
    assert!(raw_out.exists());
    let mut reader = bam::io::indexed_reader::Builder::default()
        .build_from_path(&con_out)
        .expect("failed to open indexed BAM");
    let header = reader.read_header().expect("failed to read BAM header");
    let mut record = noodles::sam::alignment::RecordBuf::default();
    let bytes_read = reader
        .read_record_buf(&header, &mut record)
        .expect("failed to read BAM record");
    // EOF on the first read: no consensus reads were extracted.
    assert_eq!(bytes_read, 0);
}
#[test]
fn test_extracts_correct_reads_for_variants() {
    use noodles::bam;
    let tmp = TempDir::new().expect("failed to create temp dir");
    let reference = test_utils::create_test_reference(&tmp);
    let (grouped, consensus) = test_utils::create_test_bams(&tmp);
    let vcf = test_utils::create_test_vcf(&tmp);
    let out_base = tmp.path().join("output");
    let review = Review {
        input: vcf,
        consensus_bam: consensus,
        grouped_bam: grouped,
        reference,
        output: out_base.clone(),
        sample: Some("tumor".to_string()),
        ignore_ns: false,
        maf: 0.05,
    };
    review.execute("test").expect("execute should succeed");
    let con_out = out_base.with_extension("consensus.bam");
    let raw_out = out_base.with_extension("grouped.bam");
    assert!(con_out.exists());
    assert!(raw_out.exists());
    assert!(out_base.with_extension("txt").exists());
    // Collects every record name from an indexed output BAM.
    let names_in = |path: &PathBuf| -> Vec<String> {
        let mut reader = bam::io::indexed_reader::Builder::default()
            .build_from_path(path)
            .expect("failed to open indexed BAM");
        let header = reader.read_header().expect("failed to read BAM header");
        let mut record = noodles::sam::alignment::RecordBuf::default();
        let mut names = Vec::new();
        while reader
            .read_record_buf(&header, &mut record)
            .expect("failed to read BAM record")
            > 0
        {
            let raw_name = record.name().expect("record should have name");
            names.push(String::from_utf8_lossy(raw_name.as_ref()).to_string());
        }
        names
    };
    // Consensus reads carrying a non-reference call at a variant site.
    let consensus_names = names_in(&con_out);
    for expected in ["A", "B", "E", "F", "H"] {
        assert!(
            consensus_names.iter().any(|n| n == expected),
            "missing consensus read: {}",
            expected
        );
    }
    // Raw reads that contributed to those consensus reads.
    let raw_names = names_in(&raw_out);
    for expected in ["A1", "A2", "B1", "E1", "F1", "H1"] {
        assert!(raw_names.iter().any(|n| n == expected), "missing raw read: {}", expected);
    }
}
#[test]
fn test_review_tsv_contains_correct_information() {
    use std::io::BufRead;
    let tmp = TempDir::new().expect("failed to create temp dir");
    let reference = test_utils::create_test_reference(&tmp);
    let (grouped, consensus) = test_utils::create_test_bams(&tmp);
    let vcf = test_utils::create_test_vcf(&tmp);
    let out_base = tmp.path().join("output");
    let review = Review {
        input: vcf,
        consensus_bam: consensus,
        grouped_bam: grouped,
        reference,
        output: out_base.clone(),
        sample: Some("tumor".to_string()),
        ignore_ns: false,
        maf: 0.05,
    };
    review.execute("test").expect("execute should succeed");
    let txt_out = out_base.with_extension("txt");
    assert!(txt_out.exists());
    let handle = std::fs::File::open(&txt_out).expect("failed to open file");
    let rows: Vec<String> = std::io::BufReader::new(handle)
        .lines()
        .map(|l| l.expect("failed to read line"))
        .collect();
    assert!(rows.len() > 1, "TSV should have header and data rows");
    // The first row is the header; spot-check the expected column names.
    for column in ["chrom", "pos", "ref_allele", "consensus_read", "consensus_call"] {
        assert!(rows[0].contains(column), "header missing column: {}", column);
    }
}
#[test]
fn test_spanning_deletions_handled_correctly() {
    use noodles::bam;
    // Read D spans a variant site with a deletion; it must still be
    // extracted into the consensus output.
    let tmp = TempDir::new().expect("failed to create temp dir");
    let reference = test_utils::create_test_reference(&tmp);
    let (grouped, consensus) = test_utils::create_test_bams(&tmp);
    let vcf = test_utils::create_test_vcf(&tmp);
    let out_base = tmp.path().join("output");
    let review = Review {
        input: vcf,
        consensus_bam: consensus,
        grouped_bam: grouped,
        reference,
        output: out_base.clone(),
        sample: Some("tumor".to_string()),
        ignore_ns: false,
        maf: 0.05,
    };
    review.execute("test").expect("execute should succeed");
    let mut reader = bam::io::indexed_reader::Builder::default()
        .build_from_path(out_base.with_extension("consensus.bam"))
        .expect("failed to open indexed BAM");
    let header = reader.read_header().expect("failed to read BAM header");
    let mut record = noodles::sam::alignment::RecordBuf::default();
    let mut saw_d = false;
    while reader
        .read_record_buf(&header, &mut record)
        .expect("failed to read BAM record")
        > 0
    {
        let raw_name = record.name().expect("record should have name");
        if String::from_utf8_lossy(raw_name.as_ref()) == "D" {
            saw_d = true;
        }
    }
    assert!(saw_d, "consensus read D (spanning deletion) should be extracted");
}
#[test]
fn test_no_calls_handled_correctly() {
    let tmp = TempDir::new().expect("failed to create temp dir");
    let reference = test_utils::create_test_reference(&tmp);
    let (grouped, consensus) = test_utils::create_test_bams(&tmp);
    let vcf = test_utils::create_test_vcf(&tmp);
    // First pass: no-calls are kept, so consensus read E appears in the report.
    let out1 = tmp.path().join("output");
    let keep_ns = Review {
        input: vcf.clone(),
        consensus_bam: consensus.clone(),
        grouped_bam: grouped.clone(),
        reference: reference.clone(),
        output: out1.clone(),
        sample: Some("tumor".to_string()),
        ignore_ns: false,
        maf: 0.05,
    };
    keep_ns.execute("test").expect("execute should succeed");
    let report1 =
        std::fs::read_to_string(out1.with_extension("txt")).expect("failed to read file");
    assert!(report1.contains("E/"), "Should contain consensus read E");
    // Second pass: with ignore_ns enabled the no-call read E is filtered out.
    let out2 = tmp.path().join("output2");
    let drop_ns = Review {
        input: vcf,
        consensus_bam: consensus,
        grouped_bam: grouped,
        reference,
        output: out2.clone(),
        sample: Some("tumor".to_string()),
        ignore_ns: true,
        maf: 0.05,
    };
    drop_ns.execute("test").expect("execute should succeed");
    let report2 =
        std::fs::read_to_string(out2.with_extension("txt")).expect("failed to read file");
    let e_count = report2.lines().filter(|l| l.contains("E/")).count();
    assert_eq!(e_count, 0, "Should not contain consensus read E when ignoring Ns");
}
#[test]
fn test_both_ends_overlapping_variant() {
    use noodles::bam;
    // Both mates of pair H cover the chr2:20 variant, so both records
    // must land in the consensus output.
    let tmp = TempDir::new().expect("failed to create temp dir");
    let reference = test_utils::create_test_reference(&tmp);
    let (grouped, consensus) = test_utils::create_test_bams(&tmp);
    let vcf = test_utils::create_test_vcf(&tmp);
    let out_base = tmp.path().join("output");
    let review = Review {
        input: vcf,
        consensus_bam: consensus,
        grouped_bam: grouped,
        reference,
        output: out_base.clone(),
        sample: Some("tumor".to_string()),
        ignore_ns: false,
        maf: 0.05,
    };
    review.execute("test").expect("execute should succeed");
    let mut reader = bam::io::indexed_reader::Builder::default()
        .build_from_path(out_base.with_extension("consensus.bam"))
        .expect("failed to open indexed BAM");
    let header = reader.read_header().expect("failed to read BAM header");
    let mut record = noodles::sam::alignment::RecordBuf::default();
    let mut h_reads = 0;
    while reader
        .read_record_buf(&header, &mut record)
        .expect("failed to read BAM record")
        > 0
    {
        let raw_name = record.name().expect("record should have name");
        if String::from_utf8_lossy(raw_name.as_ref()) == "H" {
            h_reads += 1;
        }
    }
    assert_eq!(h_reads, 2, "Should extract both ends of pair H overlapping variant at chr2:20");
}
#[test]
fn test_missing_fasta_index_fails() {
    // A reference FASTA without a .fai sidecar cannot be used; execute()
    // must return an error rather than proceed.
    let tmp = TempDir::new().expect("failed to create temp dir");
    let fasta = tmp.path().join("ref.fa");
    std::fs::write(
        &fasta,
        b">chr1\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n",
    )
    .expect("failed to write file");
    let (grouped, consensus) = test_utils::create_test_bams(&tmp);
    let vcf = test_utils::create_empty_vcf(&tmp);
    let review = Review {
        input: vcf,
        consensus_bam: consensus,
        grouped_bam: grouped,
        reference: fasta,
        output: tmp.path().join("output"),
        sample: None,
        ignore_ns: false,
        maf: 0.05,
    };
    assert!(review.execute("test").is_err(), "Should fail when FASTA index is missing");
}
#[test]
fn test_missing_fasta_dict_fails() {
    // FASTA plus .fai but no sequence dictionary: execute() must report
    // an error instead of running.
    let tmp = TempDir::new().expect("failed to create temp dir");
    let fasta = tmp.path().join("ref.fa");
    std::fs::write(
        &fasta,
        b">chr1\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n",
    )
    .expect("failed to write file");
    std::fs::write(tmp.path().join("ref.fa.fai"), b"chr1\t100\t6\t100\t101\n")
        .expect("failed to write file");
    let (grouped, consensus) = test_utils::create_test_bams(&tmp);
    let vcf = test_utils::create_empty_vcf(&tmp);
    let review = Review {
        input: vcf,
        consensus_bam: consensus,
        grouped_bam: grouped,
        reference: fasta,
        output: tmp.path().join("output"),
        sample: None,
        ignore_ns: false,
        maf: 0.05,
    };
    assert!(review.execute("test").is_err(), "Should fail when FASTA dictionary is missing");
}
}