use super::bed;
use super::getfasta;
use super::myio;
use super::trim_overlap::trim_overlapping_pafs;
use bio::alphabets::dna::revcomp;
use core::{fmt, panic};
use itertools::Itertools;
use lazy_static::lazy_static;
use natord;
use regex::Regex;
use rust_htslib::bam::record::Cigar::*;
use rust_htslib::bam::record::CigarString;
use rust_htslib::bam::record::*;
use rust_htslib::faidx;
use std::collections::{HashMap, HashSet};
use std::convert::TryFrom;
use std::io::BufRead;
use std::str::FromStr;
use std::usize;
lazy_static! {
static ref PAF_TAG: Regex = Regex::new("(..):(.):(.*)").unwrap();
}
#[derive(Debug)]
pub enum Error {
PafParseCigar { msg: String },
PafParseCS { msg: String },
ParseIntError { msg: String },
ParsePafColumn {},
}
type PafResult<T> = Result<T, crate::paf::Error>;
#[derive(Debug)]
pub struct Paf {
pub records: Vec<PafRecord>,
}
impl Default for Paf {
fn default() -> Self {
Self::new()
}
}
impl Paf {
pub fn new() -> Paf {
Paf {
records: Vec::new(),
}
}
pub fn from_file(file_name: &str) -> Paf {
let paf_file = myio::reader(file_name);
let mut paf = Paf::new();
for (index, line) in paf_file.lines().enumerate() {
log::trace!("{:?}", line);
match PafRecord::new(&line.unwrap()) {
Ok(mut rec) => {
rec.check_integrity().unwrap();
paf.records.push(rec);
}
Err(_) => eprintln!("\nUnable to parse PAF record. Skipping line {}", index + 1),
}
log::debug!("Read PAF record number: {}", index + 1);
}
paf
}
pub fn query_name_map(&mut self) -> HashMap<String, Vec<&PafRecord>> {
let mut records_by_contig = HashMap::new();
for rec in &self.records {
(*records_by_contig
.entry(rec.q_name.clone())
.or_insert_with(Vec::new))
.push(rec);
}
records_by_contig
}
pub fn filter_aln_pairs(&mut self, paired_len: u64) {
let mut dict = HashMap::new();
for rec in self.records.iter_mut() {
let aln_bp = dict
.entry((rec.t_name.clone(), rec.q_name.clone()))
.or_insert(0_u64);
*aln_bp += rec.t_en - rec.t_st;
}
self.records.retain(|rec| {
paired_len < *dict.get(&(rec.t_name.clone(), rec.q_name.clone())).unwrap()
});
}
pub fn filter_query_len(&mut self, min_query_len: u64) {
self.records.retain(|rec| rec.q_len > min_query_len);
}
pub fn filter_aln_len(&mut self, min_aln_len: u64) {
self.records.retain(|rec| rec.t_en - rec.t_st > min_aln_len);
}
pub fn orient(&mut self) {
let mut orient_order_dict = HashMap::new();
let mut t_names = HashSet::new();
for rec in &self.records {
let (orient, total_bp, order) = orient_order_dict
.entry((rec.t_name.clone(), rec.q_name.clone()))
.or_insert((0_i64, 0_u64, 0_u64));
if rec.strand == '-' {
*orient -= (rec.q_en - rec.q_st) as i64;
} else {
*orient += (rec.q_en - rec.q_st) as i64;
}
let weight = rec.t_en - rec.t_st;
*total_bp += weight;
*order += weight * (rec.t_st + rec.t_en) / 2;
t_names.insert(rec.t_name.clone());
}
for rec in &mut self.records {
let (orient, total_bp, order) = orient_order_dict
.get(&(rec.t_name.clone(), rec.q_name.clone()))
.unwrap();
rec.order = *order / *total_bp;
if *orient < 0 {
rec.q_name = format!("{}-", rec.q_name);
let new_st = rec.q_len - rec.q_en;
let new_en = rec.q_len - rec.q_st;
rec.q_st = new_st;
rec.q_en = new_en;
rec.strand = if rec.strand == '+' { '-' } else { '+' };
} else {
rec.q_name = format!("{}+", rec.q_name);
}
}
}
pub fn scaffold(&mut self, spacer_size: u64) {
self.records.sort_by(|a, b| {
a.t_name
.cmp(&b.t_name) .then(a.order.cmp(&b.order)) .then(a.q_st.cmp(&b.q_st)) });
for (_t_name, t_recs) in &self.records.iter_mut().group_by(|rec| rec.t_name.clone()) {
let mut t_recs: Vec<&mut PafRecord> = t_recs.collect();
t_recs.sort_by(|a, b| {
a.order
.cmp(&b.order) .then(a.q_st.cmp(&b.q_st)) });
let scaffold_name = t_recs
.iter()
.map(|rec| rec.q_name.clone())
.unique()
.collect::<Vec<String>>()
.join("::");
let mut scaffold_len = 0_u64;
for (_q_name, q_recs) in &t_recs.iter_mut().group_by(|rec| rec.q_name.clone()) {
let q_recs: Vec<&mut &mut PafRecord> = q_recs.collect();
let q_min = q_recs.iter().map(|rec| rec.q_st).min().unwrap_or(0);
let q_max = q_recs.iter().map(|rec| rec.q_en).max().unwrap_or(0);
let added_q_bases = q_max - q_min;
for rec in q_recs {
rec.q_st = rec.q_st - q_min + scaffold_len;
rec.q_en = rec.q_en - q_min + scaffold_len;
}
scaffold_len += added_q_bases + spacer_size;
}
scaffold_len -= spacer_size;
for rec in t_recs {
rec.q_name = scaffold_name.clone();
rec.q_len = scaffold_len;
}
}
}
pub fn overlapping_paf_recs(
&mut self,
match_score: i32,
diff_score: i32,
indel_score: i32,
remove_contained: bool,
) {
for rec in &mut self.records {
rec.remove_trailing_indels();
}
let mut overlap_pairs = Vec::new();
self.records.sort_by_key(|rec| rec.q_name.clone());
let mut contained_indexes = vec![false; self.records.len()];
if self.records.len() < 2 {
return;
}
for i in 0..(self.records.len() - 1) {
let rec1 = &self.records[i];
let rgn1 = rec1.get_query_as_region();
let mut j = i + 1;
while j < self.records.len() && rec1.q_name == self.records[j].q_name {
let rec2 = &self.records[j];
let rgn2 = rec2.get_query_as_region();
let overlap = bed::get_overlap(&rgn1, &rgn2);
if overlap < 1 {
j += 1;
continue;
} else if overlap == (rec2.q_en - rec2.q_st) {
contained_indexes[j] = true;
log::debug!("{}\n^is contained in another alignment", rec2);
} else if overlap == (rec1.q_en - rec1.q_st) {
contained_indexes[i] = true;
log::debug!("{}\n^is contained in another alignment", rec1);
} else {
if rec1.q_st <= rec2.q_st {
overlap_pairs.push((overlap, i, j));
} else {
overlap_pairs.push((overlap, j, i));
}
}
j += 1;
}
}
overlap_pairs.sort_by_key(|rec| std::u64::MAX - rec.0);
log::debug!("{} overlapping pairs found", overlap_pairs.len());
let mut q_seen: HashSet<String> = HashSet::new();
let mut unseen = 0;
for (_overlap, i, j) in overlap_pairs {
let mut left = self.records[i].clone();
let mut right = self.records[j].clone();
let q_name = left.q_name.clone();
if !q_seen.contains(&q_name) {
left.aligned_pairs();
right.aligned_pairs();
trim_overlapping_pafs(&mut left, &mut right, match_score, diff_score, indel_score);
log::trace!("{}", left);
log::trace!("{}", right);
self.records[i] = left;
self.records[j] = right;
q_seen.insert(q_name);
} else {
unseen += 1;
}
}
if unseen > 0 {
self.overlapping_paf_recs(match_score, diff_score, indel_score, remove_contained);
} else if remove_contained {
let n_to_remove = contained_indexes.iter().filter(|&x| *x).count();
log::info!("Removing {} contained alignments.", n_to_remove);
log::info!("{} total alignments.", self.records.len());
let mut new_records = Vec::new();
assert!(self.records.len() == contained_indexes.len());
for (i, rec) in self.records.iter().enumerate() {
if !contained_indexes[i] {
new_records.push(rec.clone());
}
}
self.records = new_records;
log::info!("{} total alignments.", self.records.len());
}
}
pub fn sam_header(&self) -> String {
let mut header = "@HD\tVN:1.6\n".to_string();
let mut names: Vec<(String, u64)> = self
.records
.iter()
.map(|rec| (rec.t_name.clone(), rec.t_len))
.unique()
.collect();
names.sort_by(|a, b| natord::compare(&a.0, &b.0));
for (name, length) in names {
header.push_str(&format!("@SQ\tSN:{}\tLN:{}\n", name, length));
}
header.push_str("@PG\tID:rustybam\tPN:rustybam");
header
}
}
#[derive(Debug, Clone)]
pub struct PafRecord {
pub q_name: String,
pub q_len: u64,
pub q_st: u64,
pub q_en: u64,
pub strand: char,
pub t_name: String,
pub t_len: u64,
pub t_st: u64,
pub t_en: u64,
pub nmatch: u64,
pub aln_len: u64,
pub mapq: u64,
pub cigar: CigarString,
pub tags: String,
pub tpos_aln: Vec<u64>,
pub qpos_aln: Vec<u64>,
pub long_cigar: CigarString,
pub id: String,
pub order: u64,
pub contained: bool,
}
impl PafRecord {
pub fn new(line: &str) -> PafResult<PafRecord> {
let t: Vec<&str> = line.split_ascii_whitespace().collect();
assert!(t.len() >= 12); let mut tags = "".to_string();
let mut cigar = CigarString(vec![]);
for token in t.iter().skip(12) {
assert!(PAF_TAG.is_match(token));
let caps = PAF_TAG.captures(token).unwrap();
let tag = &caps[1];
let value = &caps[3];
if tag == "cg" && cigar.len() == 0 {
log::trace!("parsing cigar of length: {}", value.len());
cigar =
CigarString::try_from(value.as_bytes()).expect("Unable to parse cigar string.");
} else {
tags.push('\t');
tags.push_str(token);
}
}
let rec = PafRecord {
q_name: t[0].to_string(),
q_len: t[1].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
q_st: t[2].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
q_en: t[3].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
strand: t[4].parse::<char>().map_err(|_| Error::ParsePafColumn {})?,
t_name: t[5].to_string(),
t_len: t[6].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
t_st: t[7].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
t_en: t[8].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
nmatch: t[9].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
aln_len: t[10].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
mapq: t[11].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
cigar,
tags,
tpos_aln: Vec::new(),
qpos_aln: Vec::new(),
long_cigar: CigarString(Vec::new()),
id: "".to_string(),
order: 0,
contained: false,
};
Ok(rec)
}
#[must_use]
pub fn small_copy(&self) -> PafRecord {
PafRecord {
q_name: self.q_name.clone(),
q_len: self.q_len,
q_st: self.q_st,
q_en: self.q_en,
strand: self.strand,
t_name: self.t_name.clone(),
t_len: self.t_len,
t_st: self.t_st,
t_en: self.t_en,
nmatch: self.nmatch,
aln_len: self.aln_len,
mapq: self.mapq,
cigar: CigarString(Vec::new()),
tags: self.tags.clone(),
tpos_aln: Vec::new(),
qpos_aln: Vec::new(),
long_cigar: CigarString(Vec::new()),
id: self.id.clone(),
order: self.order,
contained: self.contained,
}
}
pub fn get_query_as_region(&self) -> bed::Region {
bed::Region {
name: self.q_name.clone(),
st: self.q_st,
en: self.q_en,
..Default::default()
}
}
pub fn get_target_as_region(&self) -> bed::Region {
bed::Region {
name: self.t_name.clone(),
st: self.t_st,
en: self.t_en,
..Default::default()
}
}
pub fn make_long_cigar(&mut self) {
let mut long_cigar = Vec::new();
for opt in self.cigar.into_iter() {
let opt_len = opt.len() as u64;
for _i in 0..opt_len {
long_cigar.push(update_cigar_opt_len(opt, 1));
}
}
self.long_cigar = CigarString(long_cigar);
}
pub fn aligned_pairs(&mut self) {
self.remove_trailing_indels();
let mut t_pos = self.t_st as i64 - 1;
let mut q_pos = self.q_st as i64 - 1;
let mut long_cigar = Vec::new();
self.tpos_aln = Vec::new();
self.qpos_aln = Vec::new();
if self.strand == '-' {
q_pos = self.q_en as i64; }
for opt in self.cigar.into_iter() {
let moves_t = consumes_reference(opt);
let moves_q = consumes_query(opt);
let opt_len = opt.len() as u64;
for _i in 0..opt_len {
long_cigar.push(update_cigar_opt_len(opt, 1));
if moves_t {
t_pos += 1;
}
if moves_q && self.strand == '+' {
q_pos += 1;
}
if moves_q && self.strand == '-' {
q_pos -= 1;
}
self.tpos_aln.push(t_pos as u64);
self.qpos_aln.push(q_pos as u64);
}
}
self.long_cigar = CigarString(long_cigar);
}
pub fn tpos_to_idx(&self, tpos: u64) -> Result<usize, usize> {
let idx = self.tpos_aln.binary_search(&tpos)?;
Ok(idx)
}
pub fn tpos_to_idx_match(&self, qpos: u64, search_right: bool) -> Result<usize, usize> {
let mut idx = self.tpos_to_idx(qpos)?;
let max_idx = self.long_cigar.len();
if search_right {
while idx < max_idx && !matches!(self.long_cigar[idx], Match(_) | Diff(_) | Equal(_)) {
idx += 1;
}
} else {
while idx > 0 && !matches!(self.long_cigar[idx], Match(_) | Diff(_) | Equal(_)) {
idx -= 1;
}
}
Ok(idx)
}
pub fn qpos_to_idx(&self, qpos: u64) -> Result<usize, usize> {
let idx = if self.strand == '-' {
self.qpos_aln
.binary_search_by(|probe| probe.cmp(&qpos).reverse())?
} else {
self.qpos_aln.binary_search(&qpos)?
};
Ok(idx)
}
pub fn qpos_to_idx_match(&self, qpos: u64, search_right: bool) -> Result<usize, usize> {
let mut idx = self.qpos_to_idx(qpos)?;
let max_idx = self.long_cigar.len();
if (search_right && self.strand == '+') || (!search_right && self.strand == '-') {
while idx < max_idx && !matches!(self.long_cigar[idx], Match(_) | Diff(_) | Equal(_)) {
idx += 1;
}
} else {
while idx > 0 && !matches!(self.long_cigar[idx], Match(_) | Diff(_) | Equal(_)) {
idx -= 1;
}
}
Ok(idx)
}
pub fn subset_cigar(&self, start_idx: usize, end_idx: usize) -> CigarString {
let mut new_cigar = vec![];
for opt in &self.long_cigar.0[start_idx..end_idx + 1] {
new_cigar.push(*opt);
}
CigarString(new_cigar)
}
pub fn collapse_long_cigar(cigar: &CigarString) -> CigarString {
let mut rtn = Vec::new();
let mut pre_opt = cigar.0[0];
let mut pre_len = 1;
let mut idx = 1;
while idx < cigar.len() {
let cur_opt = cigar.0[idx];
if std::mem::discriminant(&cur_opt) == std::mem::discriminant(&pre_opt) {
pre_len += 1;
} else {
rtn.push(update_cigar_opt_len(&pre_opt, pre_len));
pre_opt = cur_opt;
pre_len = 1;
}
idx += 1;
}
rtn.push(update_cigar_opt_len(&pre_opt, pre_len));
CigarString(rtn)
}
pub fn paf_overlaps_rgn(&self, rgn: &bed::Region) -> bool {
if self.t_name != rgn.name {
return false;
}
self.t_en > rgn.st && self.t_st < rgn.en
}
pub fn infer_n_bases(&mut self) -> (u64, u64, u64, u64) {
let mut t_bases = 0;
let mut q_bases = 0;
let mut n_matches = 0;
let mut aln_len = 0;
for opt in self.cigar.into_iter() {
if consumes_reference(opt) {
t_bases += opt.len()
}
if consumes_query(opt) {
q_bases += opt.len()
}
if is_match(opt) {
n_matches += opt.len()
}
aln_len += opt.len();
}
(
t_bases as u64,
q_bases as u64,
n_matches as u64,
aln_len as u64,
)
}
pub fn remove_trailing_indels(&mut self) {
let cigar_len = self.cigar.len();
let mut st_opt = *self.cigar.first().unwrap();
let mut remove_st_t = 0;
let mut remove_st_q = 0;
let mut remove_st_opts = 0;
let mut removed_st_opts = Vec::new();
while matches!(st_opt, Ins(_) | Del(_)) {
if matches!(st_opt, Del(_)) {
remove_st_t += st_opt.len();
remove_st_q += 1;
} else {
remove_st_q += st_opt.len();
}
remove_st_opts += 1;
removed_st_opts.push(st_opt);
if remove_st_opts < cigar_len {
st_opt = self.cigar[remove_st_opts];
} else {
break;
}
}
if removed_st_opts.len() > 1 {
for i in 0..(removed_st_opts.len() - 1) {
let pre_opt = removed_st_opts[i];
let cur_opt = removed_st_opts[i + 1];
if (matches!(pre_opt, Del(_)) && matches!(cur_opt, Ins(_)))
|| (matches!(pre_opt, Ins(_)) && matches!(cur_opt, Del(_)))
{
remove_st_t += 1;
remove_st_q -= 1;
}
}
}
let mut en_opt = *self.cigar.last().unwrap();
let mut remove_en_t = 0;
let mut remove_en_q = 0;
let mut remove_en_opts = 0;
let mut removed_en_opts = Vec::new();
while matches!(en_opt, Ins(_) | Del(_)) {
if matches!(en_opt, Del(_)) {
remove_en_t += en_opt.len();
} else {
remove_en_q += en_opt.len();
}
remove_en_opts += 1;
removed_en_opts.push(en_opt);
if cigar_len - remove_en_opts > 0 {
en_opt = self.cigar[cigar_len - 1 - remove_en_opts];
} else {
break;
}
}
if remove_en_opts > 0 || remove_st_opts > 0 {
self.id += &format!(
"_TO.{}.{}",
CigarString(removed_st_opts.clone()),
CigarString(removed_en_opts.clone())
);
}
if remove_en_opts > 0 || remove_st_opts > 0 {
log::debug!(
"\nRemoved {} leading and {} trailing indels:\n{}\n{}\ntarget changes:{},{}\nquery changes: {},{}\n{}:{}-{}\n{}:{}-{}",
remove_st_opts,
remove_en_opts,
CigarString(removed_st_opts),
CigarString(removed_en_opts),
remove_st_t,
remove_en_t,
remove_st_q,
remove_en_q,
self.q_name,
self.q_st,
self.q_en,
self.t_name,
self.t_st,
self.t_en,
);
}
self.cigar = CigarString(self.cigar.0[remove_st_opts..].to_vec());
self.cigar.0.truncate(self.cigar.len() - remove_en_opts);
self.t_st += remove_st_t as u64;
self.t_en -= remove_en_t as u64;
if self.strand == '-' {
std::mem::swap(&mut remove_st_q, &mut remove_en_q);
}
self.q_st += remove_st_q as u64;
self.q_en -= remove_en_q as u64;
if self.cigar.len() > 0 {
let st_opt = *self.cigar.first().unwrap();
let en_opt = *self.cigar.last().unwrap();
if matches!(st_opt, Ins(_) | Del(_)) || matches!(en_opt, Ins(_) | Del(_)) {
eprintln!("Why are there still indels?\n{}", self);
}
}
self.check_integrity().unwrap();
}
pub fn truncate_record_by_query(&mut self, new_q_st: u64, new_q_en: u64) {
assert!(new_q_st >= self.q_st, "New start is less than old start.");
assert!(new_q_en <= self.q_en, "New end is greater than old end.");
self.make_long_cigar(); let mut aln_st = self.qpos_to_idx_match(new_q_st, true).unwrap();
let mut aln_en = self.qpos_to_idx_match(new_q_en - 1, false).unwrap();
let new_new_q_st = self.qpos_aln[aln_st];
let new_new_q_en = self.qpos_aln[aln_en] + 1;
if aln_st > aln_en {
std::mem::swap(&mut aln_st, &mut aln_en);
}
let new_t_st = self.tpos_aln[aln_st];
let new_t_en = self.tpos_aln[aln_en] + 1;
self.long_cigar = self.subset_cigar(aln_st, aln_en);
self.cigar = PafRecord::collapse_long_cigar(&self.long_cigar);
self.t_st = new_t_st;
self.t_en = new_t_en;
self.q_st = new_new_q_st;
self.q_en = new_new_q_en;
self.remove_trailing_indels();
self.check_integrity().unwrap();
}
pub fn check_integrity(&mut self) -> PafResult<()> {
let (t_bases, q_bases, nmatch, aln_len) = self.infer_n_bases();
if self.t_en - self.t_st != t_bases {
return Err(Error::PafParseCigar {
msg: format!(
"target bases {} from cigar does not equal {}-{}={}\n{}\n",
t_bases,
self.t_en,
self.t_st,
self.t_en - self.t_st,
self
),
});
}
if self.q_en - self.q_st != q_bases {
return Err(Error::PafParseCigar {
msg: format!(
"query bases {} from cigar does not equal {}-{}={}\n{}\n",
q_bases,
self.q_en,
self.q_st,
self.q_en - self.q_st,
self
),
});
}
self.nmatch = nmatch;
self.aln_len = aln_len;
Ok(())
}
pub fn to_sam_string(&self, reader: Option<&faidx::Reader>) -> String {
let mut clip_char = 'H';
let seq = match reader {
Some(reader) => {
let seq = getfasta::fetch_fasta(
reader,
&self.q_name,
0, self.q_len as usize,
);
clip_char = 'S';
let seq = if self.strand == '-' {
revcomp(seq)
} else {
seq
};
std::str::from_utf8(&seq).unwrap().to_string()
}
None => "*".to_string(),
};
let qual = "*".to_string();
let flag = if self.strand == '-' { 16 } else { 0 };
let mut leading_clip = if self.q_st > 0 {
format!("{}{}", self.q_st, clip_char)
} else {
"".to_string()
};
let mut trailing_clip = if self.q_len - self.q_en > 0 {
format!("{}{}", self.q_len - self.q_en, clip_char)
} else {
"".to_string()
};
if self.strand == '-' {
std::mem::swap(&mut leading_clip, &mut trailing_clip);
}
let o_cigar = format!("{}{}{}", leading_clip, self.cigar, trailing_clip);
format!(
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
self.q_name,
flag,
self.t_name,
self.t_st + 1,
self.mapq,
o_cigar,
"*",
0,
0,
seq,
qual
)
}
}
impl fmt::Display for PafRecord {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tid:Z:{}\tcg:Z:{}",
self.q_name,
self.q_len,
self.q_st,
self.q_en,
self.strand,
self.t_name,
self.t_len,
self.t_st,
self.t_en,
self.nmatch,
self.aln_len,
self.mapq,
self.id,
self.cigar,
)
}
}
pub fn consumes_reference(cigar_opt: &Cigar) -> bool {
matches!(
cigar_opt,
Match(_i) | Del(_i) | RefSkip(_i) | Diff(_i) | Equal(_i)
)
}
pub fn consumes_query(cigar_opt: &Cigar) -> bool {
matches!(
cigar_opt,
Match(_i) | Ins(_i) | SoftClip(_i) | Diff(_i) | Equal(_i)
)
}
pub fn is_match(cigar_opt: &Cigar) -> bool {
matches!(cigar_opt, Match(_i) | Diff(_i) | Equal(_i))
}
pub fn update_cigar_opt_len(opt: &Cigar, new_opt_len: u32) -> Cigar {
match opt {
Match(_) => Match(new_opt_len),
Ins(_) => Ins(new_opt_len),
Del(_) => Del(new_opt_len),
RefSkip(_) => RefSkip(new_opt_len),
HardClip(_) => HardClip(new_opt_len),
SoftClip(_) => SoftClip(new_opt_len),
Pad(_) => Pad(new_opt_len),
Equal(_) => Equal(new_opt_len),
Diff(_) => Diff(new_opt_len),
}
}
pub fn cigar_from_str(text: &str) -> PafResult<CigarString> {
let bytes = text.as_bytes();
let mut inner = Vec::new();
let mut i = 0;
let text_len = text.len();
while i < text_len {
let mut j = i;
while j < text_len && bytes[j].is_ascii_digit() {
j += 1;
}
let n = u32::from_str(&text[i..j]).map_err(|_| Error::PafParseCigar {
msg: "expected integer".to_owned(),
})?;
let op = &text[j..j + 1];
inner.push(match op {
"M" => Cigar::Match(n),
"I" => Cigar::Ins(n),
"D" => Cigar::Del(n),
"N" => Cigar::RefSkip(n),
"H" => Cigar::HardClip(n),
"S" => Cigar::SoftClip(n),
"P" => Cigar::Pad(n),
"=" => Cigar::Equal(n),
"X" => Cigar::Diff(n),
op => {
return Err(Error::PafParseCigar {
msg: format!("Cannot parse opt: {}", op),
})
}
});
i = j + 1;
}
Ok(CigarString(inner))
}
pub fn cigar_swap_target_query(cigar: &CigarString, strand: char) -> CigarString {
let mut new_cigar = Vec::new();
for opt in cigar.into_iter() {
let new_opt = match opt {
Ins(l) => Del(*l),
Del(l) => Ins(*l),
_ => *opt,
};
new_cigar.push(new_opt);
}
if strand == '-' {
new_cigar.reverse();
}
CigarString(new_cigar)
}
pub fn paf_swap_query_and_target(paf: &PafRecord) -> PafRecord {
let mut flipped = paf.clone();
flipped.t_name = paf.q_name.clone();
flipped.t_len = paf.q_len;
flipped.t_st = paf.q_st;
flipped.t_en = paf.q_en;
flipped.q_name = paf.t_name.clone();
flipped.q_len = paf.t_len;
flipped.q_st = paf.t_st;
flipped.q_en = paf.t_en;
std::mem::swap(&mut flipped.qpos_aln, &mut flipped.tpos_aln);
flipped.cigar = cigar_swap_target_query(&paf.cigar, paf.strand);
flipped.long_cigar = cigar_swap_target_query(&paf.long_cigar, paf.strand);
if !flipped.tpos_aln.is_empty() {
flipped.aligned_pairs();
}
flipped
}
pub fn make_fake_paf_rec() -> PafRecord {
let mut rtn = PafRecord::new("Q 10 2 10 - T 20 12 20 3 9 60 cg:Z:4M1I1D3=").unwrap();
rtn.aligned_pairs();
rtn
}
pub fn cs_to_cigar(cs: &str) -> PafResult<CigarString> {
let bytes = cs.as_bytes();
let length = bytes.len();
let mut i = 0;
let mut cigar = vec![];
while i < length {
let cs_opt = bytes[i];
let mut l = 0;
i += 1;
let opt = match cs_opt {
b'=' => {
while let b'A' | b'C' | b'G' | b'T' | b'N' = bytes[i] {
i += 1;
l += 1;
if i == length {
break;
}
}
Cigar::Equal(l)
}
b':' => {
let mut j = i;
while j < length && bytes[j].is_ascii_digit() {
j += 1;
}
let str = cs[i..j].to_string();
l = u32::from_str(&cs[i..j]).map_err(|_| Error::ParseIntError {
msg: format!("Expected integer, got {}", str),
})?;
i += j - 1;
Cigar::Equal(l)
}
b'*' => {
i += 2;
Cigar::Diff(1)
}
b'+' | b'-' => {
while let b'a' | b'c' | b'g' | b't' | b'n' = bytes[i] {
i += 1;
l += 1;
if i == length {
break;
}
}
match cs_opt {
b'+' => Cigar::Ins(l),
b'-' => Cigar::Del(l),
_ => panic!("should be impossible + or - needed"),
}
}
b'~' => {
return Err(Error::PafParseCS {
msg: "Splice operations not yet supported.".to_string(),
});
}
_ => {
return Err(Error::PafParseCS {
msg: format!("Unexpected operator in the cs string: {}", cs_opt as char),
});
}
};
cigar.push(opt);
}
Ok(CigarString(cigar))
}