use crate::gtf::Gene;
use anyhow::{ensure, Context, Result};
use coitrees::{COITree, Interval, IntervalTree};
use indexmap::IndexMap;
fn format_sig15(v: f64) -> String {
if v == 0.0 {
return "0".to_string();
}
let abs_v = v.abs();
let digits_before = (abs_v.log10().floor() as i32) + 1;
let decimal_places = (15 - digits_before).max(0) as usize;
let s = format!("{:.*}", decimal_places, v);
if s.contains('.') {
s.trim_end_matches('0').trim_end_matches('.').to_string()
} else {
s
}
}
use std::collections::{HashMap, HashSet};
use std::io::Write;
#[derive(Debug)]
pub struct InnerDistanceResult {
pub pairs: Vec<PairRecord>,
pub histogram: Vec<(i64, i64, u64)>,
pub total_pairs: u64,
}
#[derive(Debug)]
pub struct PairRecord {
pub name: String,
pub distance: Option<i64>,
pub classification: String,
}
#[derive(Debug, Default)]
pub struct ExonBitset {
pub intervals: HashMap<String, Vec<(u64, u64)>>,
}
impl ExonBitset {
pub fn from_genes(genes: &IndexMap<String, Gene>) -> Self {
let mut bitset = ExonBitset::default();
for gene in genes.values() {
for tx in &gene.transcripts {
let chrom = tx.chrom.to_uppercase();
for &(start, end) in &tx.exons {
bitset.add(&chrom, start - 1, end);
}
}
}
bitset.merge();
bitset
}
pub fn add(&mut self, chrom: &str, start: u64, end: u64) {
self.intervals
.entry(chrom.to_string())
.or_default()
.push((start, end));
}
fn merge(&mut self) {
for intervals in self.intervals.values_mut() {
intervals.sort();
let mut merged: Vec<(u64, u64)> = Vec::new();
for &(start, end) in intervals.iter() {
if let Some(last) = merged.last_mut() {
if start <= last.1 {
last.1 = last.1.max(end);
} else {
merged.push((start, end));
}
} else {
merged.push((start, end));
}
}
*intervals = merged;
}
}
pub fn has_chrom(&self, chrom: &str) -> bool {
self.intervals.contains_key(chrom)
}
pub fn count_exonic_bases(&self, chrom: &str, start: u64, end: u64) -> u64 {
let intervals = match self.intervals.get(chrom) {
Some(iv) => iv,
None => return 0,
};
let idx = match intervals.binary_search_by_key(&start, |&(s, _)| s) {
Ok(i) => i,
Err(i) => {
if i > 0 {
i - 1
} else {
0
}
}
};
let mut count = 0u64;
for &(iv_start, iv_end) in &intervals[idx..] {
if iv_start >= end {
break;
}
let overlap_start = start.max(iv_start);
let overlap_end = end.min(iv_end);
if overlap_start < overlap_end {
count += overlap_end - overlap_start;
}
}
count
}
}
#[derive(Default)]
pub struct TranscriptTree {
trees: HashMap<String, COITree<u32, u32>>,
names: Vec<String>,
pending: HashMap<String, Vec<(u64, u64, String)>>,
}
impl TranscriptTree {
pub fn from_genes(genes: &IndexMap<String, Gene>) -> Self {
let mut tree = TranscriptTree::default();
let mut per_chrom: HashMap<String, Vec<(u64, u64, String)>> = HashMap::new();
for gene in genes.values() {
for tx in &gene.transcripts {
let chrom = tx.chrom.to_uppercase();
let start = tx.start - 1;
let end = tx.end;
let name = format!("{}:{}:{}-{}", tx.transcript_id, chrom, start, end);
per_chrom.entry(chrom).or_default().push((start, end, name));
}
}
for (chrom, transcripts) in &per_chrom {
for (start, end, name) in transcripts.iter().skip(1) {
tree.add(chrom, *start, *end, name);
}
}
tree.build();
tree
}
pub fn add(&mut self, chrom: &str, start: u64, end: u64, name: &str) {
self.pending
.entry(chrom.to_string())
.or_default()
.push((start, end, name.to_string()));
}
pub fn build(&mut self) {
let pending = std::mem::take(&mut self.pending);
for (chrom, intervals) in pending {
let coitree_intervals: Vec<Interval<u32>> = intervals
.into_iter()
.map(|(start, end, name)| {
let idx = self.names.len() as u32;
self.names.push(name);
debug_assert!(
start <= i32::MAX as u64 && end <= i32::MAX as u64,
"Coordinate overflow: TranscriptTree requires coordinates < 2^31"
);
Interval::new(start as i32, (end.saturating_sub(1)) as i32, idx)
})
.collect();
self.trees.insert(chrom, COITree::new(&coitree_intervals));
}
}
pub fn find_overlapping(&self, chrom: &str, point: u64) -> HashSet<String> {
let mut result = HashSet::new();
if let Some(tree) = self.trees.get(chrom) {
let p = point as i32;
tree.query(p, p, |node| {
fn as_usize(v: impl std::borrow::Borrow<u32>) -> usize {
*v.borrow() as usize
}
let idx = as_usize(node.metadata);
if idx < self.names.len() {
result.insert(self.names[idx].clone());
}
});
}
result
}
}
pub fn build_histogram(
distances: &[i64],
lower_bound: i64,
upper_bound: i64,
step: i64,
) -> Result<Vec<(i64, i64, u64)>> {
ensure!(step > 0, "inner_distance step must be positive, got {step}");
ensure!(
lower_bound < upper_bound,
"inner_distance lower_bound ({lower_bound}) must be less than upper_bound ({upper_bound})"
);
let mut sorted = distances.to_vec();
sorted.sort_unstable();
let mut bins: Vec<(i64, i64, u64)> = Vec::new();
let mut cursor = 0;
let mut bin_start = lower_bound;
while bin_start < upper_bound {
let bin_end = bin_start + step;
while cursor < sorted.len() && sorted[cursor] <= bin_start {
cursor += 1;
}
let start_cursor = cursor;
while cursor < sorted.len() && sorted[cursor] <= bin_end {
cursor += 1;
}
bins.push((bin_start, bin_end, (cursor - start_cursor) as u64));
bin_start = bin_end;
}
Ok(bins)
}
pub fn write_detail_file(result: &InnerDistanceResult, path: &str) -> Result<()> {
let mut file = std::fs::File::create(path)
.with_context(|| format!("Failed to create detail file: {}", path))?;
for pair in &result.pairs {
let dist_str = match pair.distance {
Some(d) => d.to_string(),
None => "NA".to_string(),
};
writeln!(file, "{}\t{}\t{}", pair.name, dist_str, pair.classification)?;
}
Ok(())
}
pub fn write_freq_file(result: &InnerDistanceResult, path: &str) -> Result<()> {
let mut file = std::fs::File::create(path)
.with_context(|| format!("Failed to create frequency file: {}", path))?;
for &(bin_start, bin_end, count) in &result.histogram {
writeln!(file, "{}\t{}\t{}", bin_start, bin_end, count)?;
}
Ok(())
}
pub fn write_r_script(
result: &InnerDistanceResult,
prefix: &str,
path: &str,
step: i64,
) -> Result<()> {
let mut file = std::fs::File::create(path)
.with_context(|| format!("Failed to create R script: {}", path))?;
let bin_centers: Vec<String> = result
.histogram
.iter()
.map(|&(start, _end, _count)| format!("{}", start as f64 + step as f64 / 2.0))
.collect();
let counts: Vec<String> = result
.histogram
.iter()
.map(|&(_start, _end, count)| count.to_string())
.collect();
let num_bins = result.histogram.len();
writeln!(file, "out_file = '{}'", prefix)?;
writeln!(file, "pdf('{}.inner_distance_plot.pdf')", prefix)?;
writeln!(
file,
"fragsize=rep(c({}),times=c({}))",
bin_centers.join(","),
counts.join(",")
)?;
writeln!(file, "frag_sd = sd(fragsize)")?;
writeln!(file, "frag_mean = mean(fragsize)")?;
writeln!(file, "frag_median = median(fragsize)")?;
writeln!(
file,
"write(x=c(\"Name\",\"Mean\",\"Median\",\"sd\"), sep=\"\t\", file=stdout(),ncolumns=4)"
)?;
writeln!(
file,
"write(c(out_file,frag_mean,frag_median,frag_sd),sep=\"\t\", file=stdout(),ncolumns=4)"
)?;
writeln!(
file,
"hist(fragsize,probability=T,breaks={},xlab=\"mRNA insert size (bp)\",main=paste(c(\"Mean=\",frag_mean,\";\",\"SD=\",frag_sd),collapse=\"\"),border=\"blue\")",
num_bins
)?;
writeln!(file, "lines(density(fragsize,bw={}),col='red')", 2 * step)?;
writeln!(file, "dev.off()")?;
Ok(())
}
pub fn write_summary(result: &InnerDistanceResult, path: &str) -> Result<()> {
let mut file = std::fs::File::create(path)
.with_context(|| format!("Failed to create summary file: {}", path))?;
let mut same_chrom_no = 0u64;
let mut same_transcript_no = 0u64;
let mut same_exon_yes = 0u64;
let mut same_exon_no = 0u64;
let mut non_exonic = 0u64;
let mut unknown_chrom = 0u64;
let mut overlap = 0u64;
for pair in &result.pairs {
match pair.classification.as_str() {
"sameChrom=No" => same_chrom_no += 1,
"sameTranscript=No,dist=genomic" => same_transcript_no += 1,
"sameTranscript=Yes,sameExon=Yes,dist=mRNA" => same_exon_yes += 1,
"sameTranscript=Yes,sameExon=No,dist=mRNA" => same_exon_no += 1,
"sameTranscript=Yes,nonExonic=Yes,dist=genomic" => non_exonic += 1,
"unknownChromosome,dist=genomic" => unknown_chrom += 1,
"readPairOverlap" => overlap += 1,
_ => {}
}
}
writeln!(file, "Total read pairs: {}", result.total_pairs)?;
writeln!(file, "Different chromosome: {}", same_chrom_no)?;
writeln!(file, "Different transcript: {}", same_transcript_no)?;
writeln!(file, "Same exon (mRNA distance): {}", same_exon_yes)?;
writeln!(file, "Different exon (mRNA distance): {}", same_exon_no)?;
writeln!(file, "Non-exonic (genomic distance): {}", non_exonic)?;
writeln!(file, "Unknown chromosome: {}", unknown_chrom)?;
writeln!(file, "Read pair overlap: {}", overlap)?;
if !result.histogram.is_empty() {
let total: u64 = result.histogram.iter().map(|&(_, _, c)| c).sum();
if total > 0 {
let step = if result.histogram.len() > 1 {
result.histogram[1].0 - result.histogram[0].0
} else {
5
};
let mean: f64 = result
.histogram
.iter()
.map(|&(s, _, c)| (s as f64 + step as f64 / 2.0) * c as f64)
.sum::<f64>()
/ total as f64;
writeln!(file, "Mean inner distance: {:.2}", mean)?;
}
}
Ok(())
}
pub fn write_mean_file(result: &InnerDistanceResult, sample_name: &str, path: &str) -> Result<()> {
let mut file = std::fs::File::create(path)
.with_context(|| format!("Failed to create mean file: {}", path))?;
writeln!(file, "Name\tMean\tMedian\tsd")?;
if !result.histogram.is_empty() {
let total: u64 = result.histogram.iter().map(|&(_, _, c)| c).sum();
if total > 0 {
let step = if result.histogram.len() > 1 {
result.histogram[1].0 - result.histogram[0].0
} else {
5
};
let mut values: Vec<f64> = Vec::with_capacity(total as usize);
for &(start, _, count) in &result.histogram {
let center = start as f64 + step as f64 / 2.0;
for _ in 0..count {
values.push(center);
}
}
let n = values.len() as f64;
let mean: f64 = values.iter().sum::<f64>() / n;
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let median = if values.len().is_multiple_of(2) {
(values[values.len() / 2 - 1] + values[values.len() / 2]) / 2.0
} else {
values[values.len() / 2]
};
let variance: f64 = values.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / (n - 1.0);
let sd = variance.sqrt();
writeln!(
file,
"{}\t{}\t{}\t{}",
sample_name,
format_sig15(mean),
format_sig15(median),
format_sig15(sd),
)?;
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_exon_bitset_merge() {
let mut bitset = ExonBitset::default();
bitset.add("CHR1", 100, 200);
bitset.add("CHR1", 150, 300);
bitset.add("CHR1", 400, 500);
bitset.merge();
let intervals = &bitset.intervals["CHR1"];
assert_eq!(intervals.len(), 2);
assert_eq!(intervals[0], (100, 300));
assert_eq!(intervals[1], (400, 500));
}
#[test]
fn test_exon_bitset_count() {
let mut bitset = ExonBitset::default();
bitset.add("CHR1", 100, 200);
bitset.add("CHR1", 300, 400);
bitset.merge();
assert_eq!(bitset.count_exonic_bases("CHR1", 100, 200), 100);
assert_eq!(bitset.count_exonic_bases("CHR1", 150, 350), 100); assert_eq!(bitset.count_exonic_bases("CHR1", 200, 300), 0);
assert_eq!(bitset.count_exonic_bases("CHR2", 100, 200), 0);
}
#[test]
fn test_histogram() {
let distances = vec![-5, -3, 0, 2, 7, 12];
let hist = build_histogram(&distances, -10, 15, 5).unwrap();
assert_eq!(hist.len(), 5);
assert_eq!(hist[0], (-10, -5, 1)); assert_eq!(hist[1], (-5, 0, 2)); assert_eq!(hist[2], (0, 5, 1)); assert_eq!(hist[3], (5, 10, 1)); assert_eq!(hist[4], (10, 15, 1)); }
#[test]
fn test_transcript_tree_first_per_chrom_dropped() {
let mut tree = TranscriptTree::default();
tree.add("CHR1", 100, 200, "gene1");
tree.add("CHR1", 150, 300, "gene2");
tree.build();
let genes = tree.find_overlapping("CHR1", 175);
assert!(genes.contains("gene1")); assert!(genes.contains("gene2"));
}
}