use crate::bam_io::create_bam_reader;
use crate::progress::ProgressTracker;
use crate::sam::SamTag;
use crate::template::TemplateIterator;
use anyhow::{Context, Result};
use log::info;
use murmur3::murmur3_32;
use noodles::sam::alignment::record::Cigar;
use noodles::sam::alignment::record::cigar::op::Kind;
use noodles::sam::alignment::record::data::field::Tag;
use noodles::sam::alignment::record_buf::RecordBuf;
use std::path::Path;
use std::sync::OnceLock;
pub const DOWNSAMPLING_FRACTIONS: [f64; 20] = [
0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80,
0.85, 0.90, 0.95, 1.00,
];
static R_AVAILABLE: OnceLock<bool> = OnceLock::new();
#[derive(Clone, Debug)]
pub struct Interval {
pub ref_name: String,
pub start: i32,
pub end: i32,
}
#[derive(Clone)]
pub struct TemplateInfo {
pub mi: String,
pub rx: String,
pub ref_name: Option<String>,
pub position: Option<i32>,
pub end_position: Option<i32>,
pub hash_fraction: f64,
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct ReadInfoKey {
pub ref_index1: Option<usize>,
pub start1: i32,
pub strand1: bool,
pub ref_index2: Option<usize>,
pub start2: i32,
pub strand2: bool,
}
pub struct TemplateMetadata<'a> {
pub template: &'a TemplateInfo,
pub base_umi: &'a str,
pub is_a_strand: bool,
pub is_b_strand: bool,
}
pub fn unclipped_five_prime_position(record: &RecordBuf) -> Option<i32> {
let is_reverse = record.flags().is_reverse_complemented();
let cigar = record.cigar();
if is_reverse {
let alignment_end = record.alignment_end().map(|p| usize::from(p) as i32)?;
let trailing_clips: i32 = cigar
.iter()
.filter_map(std::result::Result::ok)
.last()
.filter(|op| op.kind() == Kind::SoftClip)
.map_or(0, |op| op.len() as i32);
Some(alignment_end + trailing_clips)
} else {
let alignment_start = record.alignment_start().map(|p| usize::from(p) as i32)?;
let leading_clips: i32 = cigar
.iter()
.find_map(std::result::Result::ok)
.filter(|op| op.kind() == Kind::SoftClip)
.map_or(0, |op| op.len() as i32);
Some(alignment_start - leading_clips)
}
}
pub fn compute_hash_fraction(read_name: &str) -> f64 {
let hash = murmur3_32(&mut std::io::Cursor::new(read_name.as_bytes()), 42).unwrap_or(0);
let positive_hash = (hash as i32).unsigned_abs() as f64;
positive_hash / i32::MAX as f64
}
pub fn parse_intervals(path: &Path) -> Result<Vec<Interval>> {
use std::fs::File;
use std::io::{BufRead, BufReader};
let file = File::open(path)?;
let reader = BufReader::new(file);
let mut intervals = Vec::new();
let mut is_interval_list = false;
for line in reader.lines() {
let line = line?;
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
if line.starts_with('@') {
is_interval_list = true;
continue;
}
let mut fields = line.splitn(4, '\t');
let ref_name = fields.next().expect("splitn always yields at least one element");
let start_str = fields.next();
let end_str = fields.next();
let (Some(start_str), Some(end_str)) = (start_str, end_str) else {
let fmt = if is_interval_list { "interval list" } else { "BED" };
anyhow::bail!("Invalid {fmt} line (needs at least 3 fields): {line}");
};
if is_interval_list {
let start: i32 = start_str
.parse::<i32>()
.map_err(|_| anyhow::anyhow!("Invalid start position: {start_str}"))?
- 1; let end: i32 =
end_str.parse().map_err(|_| anyhow::anyhow!("Invalid end position: {end_str}"))?;
intervals.push(Interval { ref_name: ref_name.to_string(), start, end });
} else {
let start: i32 = start_str
.parse()
.map_err(|_| anyhow::anyhow!("Invalid start position: {start_str}"))?;
let end: i32 =
end_str.parse().map_err(|_| anyhow::anyhow!("Invalid end position: {end_str}"))?;
intervals.push(Interval { ref_name: ref_name.to_string(), start, end });
}
}
Ok(intervals)
}
pub fn overlaps_intervals(template: &TemplateInfo, intervals: &[Interval]) -> bool {
if intervals.is_empty() {
return true; }
if let (Some(ref_name), Some(start), Some(end)) =
(&template.ref_name, template.position, template.end_position)
{
intervals.iter().any(|interval| {
interval.ref_name == *ref_name && start <= interval.end && interval.start < end
})
} else {
false }
}
pub fn validate_not_consensus_bam(input: &Path) -> Result<()> {
use crate::consensus_tags::is_consensus;
let (mut reader, header) = create_bam_reader(input, 1)?;
for result in reader.record_bufs(&header) {
let record = result?;
let flags = record.flags();
if !flags.is_segmented()
|| !flags.is_first_segment()
|| flags.is_secondary()
|| flags.is_supplementary()
|| flags.is_unmapped()
{
continue;
}
if is_consensus(&record) {
anyhow::bail!(
"Input BAM file ({}) appears to contain consensus sequences. \
This metrics tool cannot run on consensus BAMs, and instead requires \
the UMI-grouped BAM generated by group which is run prior to consensus calling.\n\
First R1 record '{}' has consensus SAM tags present.",
input.display(),
record.name().map_or_else(
|| "<unnamed>".to_string(),
|n| String::from_utf8_lossy(n.as_ref()).to_string()
)
);
}
break;
}
Ok(())
}
pub fn is_r_available() -> bool {
use std::process::Command;
*R_AVAILABLE.get_or_init(|| {
Command::new("Rscript")
.args(["-e", "stopifnot(require(ggplot2)); stopifnot(require(scales))"])
.output()
.map(|output| output.status.success())
.unwrap_or(false)
})
}
pub fn execute_r_script(r_script_content: &str, args: &[&str], temp_file_name: &str) -> Result<()> {
use std::process::Command;
let temp_dir = std::env::temp_dir();
let r_script_path = temp_dir.join(temp_file_name);
std::fs::write(&r_script_path, r_script_content)
.context("Failed to write embedded R script to temp file")?;
info!("Executing R script to generate PDF plots...");
let output = Command::new("Rscript")
.arg(&r_script_path)
.args(args)
.output()
.context("Failed to execute Rscript command")?;
let _ = std::fs::remove_file(&r_script_path);
if output.status.success() {
Ok(())
} else {
let stderr = String::from_utf8_lossy(&output.stderr);
anyhow::bail!(
"R script execution failed with exit code {:?}. Error: {}",
output.status.code(),
stderr
)
}
}
pub fn compute_template_metadata(group: &[TemplateInfo]) -> Vec<TemplateMetadata<'_>> {
group
.iter()
.map(|t| {
let (base_umi, is_a, is_b) = if t.mi.ends_with("/A") {
(&t.mi[..t.mi.len() - 2], true, false)
} else if t.mi.ends_with("/B") {
(&t.mi[..t.mi.len() - 2], false, true)
} else {
(t.mi.as_str(), false, false)
};
TemplateMetadata { template: t, base_umi, is_a_strand: is_a, is_b_strand: is_b }
})
.collect()
}
pub fn process_templates_from_bam<F>(
input: &Path,
intervals: &[Interval],
num_fractions: usize,
mut process_group: F,
) -> Result<(usize, Vec<usize>)>
where
F: FnMut(&[TemplateInfo], &mut Vec<usize>),
{
let (mut reader, header) = create_bam_reader(input, 1)?;
let record_iter = reader.record_bufs(&header).map(|r| r.map_err(Into::into));
let template_iter = TemplateIterator::new(record_iter);
let mut current_group: Vec<TemplateInfo> = Vec::new();
let mut current_key: Option<ReadInfoKey> = None;
let mut template_count = 0;
let progress = ProgressTracker::new("Processed records").with_interval(1_000_000);
let mut fraction_template_counts: Vec<usize> = vec![0; num_fractions];
let mi_tag = Tag::from(SamTag::MI);
let umi_tag = Tag::from(SamTag::RX);
for template in template_iter {
let template = template?;
if template.records.len() < 2 {
continue;
}
let r1 = template.records.iter().find(|r| {
let f = r.flags();
f.is_segmented()
&& !f.is_unmapped()
&& !f.is_mate_unmapped()
&& f.is_first_segment()
&& !f.is_secondary()
&& !f.is_supplementary()
});
let r2 = template.records.iter().find(|r| {
let f = r.flags();
f.is_segmented()
&& !f.is_unmapped()
&& !f.is_mate_unmapped()
&& f.is_last_segment()
&& !f.is_secondary()
&& !f.is_supplementary()
});
let (r1, r2) = match (r1, r2) {
(Some(r1), Some(r2)) => (r1, r2),
_ => continue,
};
let read_name =
r1.name().map(|n| String::from_utf8_lossy(n.as_ref()).to_string()).unwrap_or_default();
let mi = if let Some(noodles::sam::alignment::record_buf::data::field::Value::String(s)) =
r1.data().get(&mi_tag)
{
String::from_utf8(s.iter().copied().collect::<Vec<u8>>())?
} else {
return Err(anyhow::anyhow!(
"Read '{}' is missing the required MI tag. \
Metrics commands require standard MI/RX tags.",
read_name
));
};
let rx = if let Some(noodles::sam::alignment::record_buf::data::field::Value::String(s)) =
r1.data().get(&umi_tag)
{
String::from_utf8(s.iter().copied().collect::<Vec<u8>>())?
} else {
return Err(anyhow::anyhow!(
"Read '{}' is missing the required RX tag. \
Metrics commands require standard MI/RX tags.",
read_name
));
};
let ref_name = if let Some(ref_id) = r1.reference_sequence_id() {
header.reference_sequences().get_index(ref_id).map(|(name, _)| name.to_string())
} else {
None
};
let (position, end_position, read_info_key) = {
let r1_ref = r1.reference_sequence_id();
let r2_ref = r2.reference_sequence_id();
if r1_ref == r2_ref && r1_ref.is_some() {
let r1_5prime = unclipped_five_prime_position(r1);
let r2_5prime = unclipped_five_prime_position(r2);
let r1_strand = r1.flags().is_reverse_complemented();
let r2_strand = r2.flags().is_reverse_complemented();
if let (Some(s1), Some(s2)) = (r1_5prime, r2_5prime) {
let r1_start = r1.alignment_start().map(|p| usize::from(p) as i32);
let r2_start = r2.alignment_start().map(|p| usize::from(p) as i32);
let r1_end = r1.alignment_end().map(|p| usize::from(p) as i32);
let r2_end = r2.alignment_end().map(|p| usize::from(p) as i32);
let (pos, end) = match (r1_start, r2_start, r1_end, r2_end) {
(Some(rs1), Some(rs2), Some(re1), Some(re2)) => {
(rs1.min(rs2), re1.max(re2))
}
(Some(rs1), Some(rs2), _, _) => (rs1.min(rs2), rs1.max(rs2)),
_ => continue,
};
let key = if (r1_ref, s1) <= (r2_ref, s2) {
ReadInfoKey {
ref_index1: r1_ref,
start1: s1,
strand1: r1_strand,
ref_index2: r2_ref,
start2: s2,
strand2: r2_strand,
}
} else {
ReadInfoKey {
ref_index1: r2_ref,
start1: s2,
strand1: r2_strand,
ref_index2: r1_ref,
start2: s1,
strand2: r1_strand,
}
};
(pos, end, key)
} else {
continue;
}
} else {
continue;
}
};
let hash_fraction = compute_hash_fraction(&read_name);
let template_info = TemplateInfo {
mi,
rx,
ref_name,
position: Some(position),
end_position: Some(end_position),
hash_fraction,
};
if !overlaps_intervals(&template_info, intervals) {
continue;
}
template_count += 1;
progress.log_if_needed(2);
if current_key.as_ref() != Some(&read_info_key) && !current_group.is_empty() {
process_group(¤t_group, &mut fraction_template_counts);
current_group.clear();
}
current_group.push(template_info);
current_key = Some(read_info_key);
}
if !current_group.is_empty() {
process_group(¤t_group, &mut fraction_template_counts);
}
progress.log_final();
Ok((template_count, fraction_template_counts))
}