use std::fs::File;
use std::io::{BufWriter, Write};
use std::num::NonZeroUsize;
use std::path::Path;
use anyhow::{Context, anyhow};
use tracing::{info, warn};
use noodles_sam::header::Header;
use noodles_sam::header::record::value as header_val;
use noodles_sam::header::record::value::Map as HeaderMap;
use bramble_rs::GenomicAlignment;
use bramble_rs::ProjectedAlignment;
use bramble_rs::ProjectionConfig;
use bramble_rs::annotation::{Transcript, load_transcripts};
use bramble_rs::fasta::FastaDb;
use bramble_rs::g2t::{G2TTree, build_g2t_from_refnames};
use crate::prog_opts::Args;
use crate::util::oarfish_types::{ProjectedAlnRecord, TranscriptInfo};
pub fn load_g2t(
annotation: &Path,
refnames: &[String],
rescue_fasta: Option<FastaDb>,
) -> anyhow::Result<G2TTree> {
info!("loading annotation from {}", annotation.display());
let transcripts = load_transcripts(annotation)
.with_context(|| format!("failed to load annotation {}", annotation.display()))?;
info!("loaded {} transcripts from annotation", transcripts.len());
build_g2t_from_transcripts(&transcripts, refnames, rescue_fasta)
}
pub fn build_g2t_from_transcripts(
transcripts: &[Transcript],
refnames: &[String],
rescue_fasta: Option<FastaDb>,
) -> anyhow::Result<G2TTree> {
let g2t = build_g2t_from_refnames(transcripts, refnames, rescue_fasta.as_ref())
.context("failed to build genome-to-transcriptome index")?;
info!("built g2t index over {} transcripts", g2t.num_transcripts());
Ok(g2t)
}
pub fn write_annotation_junction_bed(
transcripts: &[Transcript],
path: &Path,
) -> anyhow::Result<usize> {
let f = File::create(path)
.with_context(|| format!("failed to create junction BED {}", path.display()))?;
let mut w = BufWriter::new(f);
let mut n = 0usize;
for tx in transcripts {
if tx.exons.len() < 2 {
continue; }
let mut exons: Vec<(u32, u32)> = tx
.exons
.iter()
.map(|e| (e.start.saturating_sub(1), e.end.saturating_sub(1)))
.collect();
exons.sort_by_key(|e| e.0);
let chrom_start = exons[0].0;
let chrom_end = exons.last().unwrap().1;
let strand = match tx.strand {
'+' => '+',
'-' => '-',
_ => '.',
};
let mut sizes = String::new();
let mut starts = String::new();
for (s, e) in &exons {
sizes.push_str(&(e - s).to_string());
sizes.push(',');
starts.push_str(&(s - chrom_start).to_string());
starts.push(',');
}
writeln!(
w,
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
tx.seqname,
chrom_start,
chrom_end,
tx.id,
1000,
strand,
chrom_start,
chrom_end,
0,
exons.len(),
sizes,
starts,
)?;
n += 1;
}
w.flush()?;
Ok(n)
}
pub fn build_transcriptome_header_and_info(
g2t: &G2TTree,
args: &Args,
) -> anyhow::Result<(Header, Vec<TranscriptInfo>, Vec<String>)> {
let n = g2t.num_transcripts();
let mut builder = Header::builder();
let mut txps: Vec<TranscriptInfo> = Vec::with_capacity(n);
let mut names: Vec<String> = Vec::with_capacity(n);
let mut n_zero_len = 0usize;
for tid in 0..n {
let tid = tid as u32;
let name = g2t
.transcript_name(tid)
.ok_or_else(|| anyhow!("g2t index missing a name for transcript id {tid}"))?
.to_string();
let raw_len = g2t
.transcript_len(tid)
.ok_or_else(|| anyhow!("g2t index missing a length for transcript id {tid}"))?;
if raw_len == 0 {
n_zero_len += 1;
}
let len = NonZeroUsize::new(raw_len.max(1) as usize).expect("len >= 1");
builder = builder.add_reference_sequence(
name.clone(),
HeaderMap::<header_val::map::ReferenceSequence>::new(len),
);
let info = if args.model_coverage {
TranscriptInfo::with_len_and_bin_width(len, args.bin_width)
} else {
TranscriptInfo::with_len(len)
};
txps.push(info);
names.push(name);
}
builder = builder.add_program(
"bramble",
HeaderMap::<header_val::map::Program>::default(),
);
if n_zero_len > 0 {
warn!(
"{} transcript(s) in the annotation had zero exonic length; they were retained with length 1.",
n_zero_len
);
}
info!(
"constructed transcriptome header with {} transcripts.",
names.len()
);
Ok((builder.build(), txps, names))
}
pub fn projected_to_records(
projected: &[ProjectedAlignment],
src_scores: &[i32],
) -> Vec<ProjectedAlnRecord> {
projected
.iter()
.map(|p| ProjectedAlnRecord {
ref_id: p.transcript_id,
start: p.transcript_start,
end: p.transcript_end,
aligned_len: p.aligned_len,
query_aligned_len: p.query_aligned_len,
is_reverse: p.is_reverse,
similarity: p.similarity_score,
aln_score: src_scores.get(p.input_index).copied().unwrap_or(0),
})
.collect()
}
pub fn rescue_fasta_path(args: &Args) -> Option<std::path::PathBuf> {
if args.no_rescue {
return None;
}
if let Some(p) = &args.genome_fasta {
return Some(p.clone());
}
if let Some(g) = &args.genome
&& crate::util::file_utils::is_fasta(g).unwrap_or(false)
{
return Some(g.clone());
}
None
}
pub fn load_rescue_fasta(args: &Args) -> anyhow::Result<Option<FastaDb>> {
match rescue_fasta_path(args) {
Some(p) => {
info!("loading genome FASTA for soft-clip rescue from {}", p.display());
Ok(Some(FastaDb::load(&p).with_context(|| {
format!("failed to load genome FASTA {}", p.display())
})?))
}
None => Ok(None),
}
}
pub fn projection_config(args: &Args, use_fasta: bool) -> ProjectionConfig {
ProjectionConfig {
long_reads: true,
use_fasta,
junc_miss_discount: args.junc_miss_discount,
}
}
pub fn revcomp(seq: &[u8]) -> Vec<u8> {
seq.iter()
.rev()
.map(|&b| match b {
b'A' | b'a' => b'T',
b'C' | b'c' => b'G',
b'G' | b'g' => b'C',
b'T' | b't' => b'A',
b'U' | b'u' => b'A',
_ => b'N',
})
.collect()
}
pub fn mapping_to_genomic_alignment(
m: &crate::util::mapper::OMapping,
query_name: &str,
read_len: usize,
) -> Option<GenomicAlignment> {
let cigar_ops = m.m.cigar_ops.as_ref()?;
if cigar_ops.is_empty() {
return None;
}
let is_reverse = matches!(m.m.strand, rammap::api::Strand::Reverse);
let clip5 = if is_reverse {
read_len.saturating_sub(m.m.query_end)
} else {
m.m.query_start
};
let clip3 = if is_reverse {
m.m.query_start
} else {
read_len.saturating_sub(m.m.query_end)
};
let mut cigar: Vec<(u32, u8)> = Vec::with_capacity(cigar_ops.len() + 2);
if clip5 > 0 {
cigar.push((clip5 as u32, 4)); }
cigar.extend(cigar_ops.iter().map(|c| (c.len, c.op)));
if clip3 > 0 {
cigar.push((clip3 as u32, 4));
}
let ts_strand = m.m.trans_strand.map(|s| match s {
rammap::api::Strand::Forward => '+',
rammap::api::Strand::Reverse => '-',
});
Some(GenomicAlignment {
query_name: query_name.to_string(),
ref_id: m.m.target_id as i32,
ref_start: (m.m.target_start as i64) + 1, is_reverse,
cigar,
sequence: None,
is_paired: false,
is_first_in_pair: false,
xs_strand: None,
ts_strand,
hit_index: 0,
mate_ref_id: None,
mate_ref_start: None,
mate_is_unmapped: false,
read_len,
})
}