1use super::bed;
2use super::getfasta;
3use super::myio;
4use super::trim_overlap::trim_overlapping_pafs;
5use bio::alphabets::dna::revcomp;
6use core::fmt;
7use itertools::Itertools;
8use natord;
9use rust_htslib::bam::record::Cigar::*;
10use rust_htslib::bam::record::CigarString;
11use rust_htslib::bam::record::*;
12use rust_htslib::faidx;
13use std::collections::{HashMap, HashSet};
14use std::convert::TryFrom;
15use std::io::BufRead;
16use std::str::FromStr;
17
18#[derive(Debug)]
21pub enum Error {
22 PafParseCigar { msg: String },
23 PafParseCS { msg: String },
24 ParseIntError { msg: String },
25 ParsePafColumn {},
26}
27type PafResult<T> = Result<T, crate::paf::Error>;
28
29#[derive(Debug)]
30pub struct Paf {
31 pub records: Vec<PafRecord>,
32 }
34
35impl Default for Paf {
36 fn default() -> Self {
37 Self::new()
38 }
39}
40
41impl Paf {
42 pub fn new() -> Paf {
43 Paf {
44 records: Vec::new(),
45 }
47 }
48 pub fn from_file(file_name: &str) -> Paf {
59 let paf_file = myio::reader(file_name);
60 let mut paf = Paf::new();
61 for (index, line) in paf_file.lines().enumerate() {
63 log::trace!("{:?}", line);
64 match PafRecord::new(&line.unwrap()) {
65 Ok(mut rec) => {
66 rec.check_integrity().unwrap();
67 paf.records.push(rec);
68 }
69 Err(_) => eprintln!("\nUnable to parse PAF record. Skipping line {}", index + 1),
70 }
71 log::debug!("Read PAF record number: {}", index + 1);
72 }
73 paf
74 }
75
76 pub fn query_name_map(&mut self) -> HashMap<String, Vec<&PafRecord>> {
77 let mut records_by_contig = HashMap::new();
78 for rec in &self.records {
79 (*records_by_contig
80 .entry(rec.q_name.clone())
81 .or_insert_with(Vec::new))
82 .push(rec);
83 }
84 records_by_contig
85 }
86
87 pub fn filter_aln_pairs(&mut self, paired_len: u64) {
88 let mut dict = HashMap::new();
89 for rec in self.records.iter_mut() {
90 let aln_bp = dict
91 .entry((rec.t_name.clone(), rec.q_name.clone()))
92 .or_insert(0_u64);
93 *aln_bp += rec.t_en - rec.t_st;
94 }
95 self.records.retain(|rec| {
96 paired_len < *dict.get(&(rec.t_name.clone(), rec.q_name.clone())).unwrap()
97 });
98 }
99
100 pub fn filter_query_len(&mut self, min_query_len: u64) {
101 self.records.retain(|rec| rec.q_len > min_query_len);
102 }
103
104 pub fn filter_aln_len(&mut self, min_aln_len: u64) {
106 self.records.retain(|rec| rec.t_en - rec.t_st > min_aln_len);
107 }
108
109 pub fn orient(&mut self) {
111 let mut orient_order_dict = HashMap::new();
112 let mut t_names = HashSet::new();
113 for rec in &self.records {
116 let (orient, total_bp, order) = orient_order_dict
117 .entry((rec.t_name.clone(), rec.q_name.clone()))
118 .or_insert((0_i64, 0_u64, 0_u64));
119 if rec.strand == '-' {
121 *orient -= (rec.q_en - rec.q_st) as i64;
122 } else {
123 *orient += (rec.q_en - rec.q_st) as i64;
124 }
125 let weight = rec.t_en - rec.t_st;
127 *total_bp += weight;
128 *order += weight * (rec.t_st + rec.t_en) / 2;
129 t_names.insert(rec.t_name.clone());
131 }
132
133 for rec in &mut self.records {
135 let (orient, total_bp, order) = orient_order_dict
137 .get(&(rec.t_name.clone(), rec.q_name.clone()))
138 .unwrap();
139 rec.order = *order / *total_bp;
140
141 if *orient < 0 {
143 rec.q_name = format!("{}-", rec.q_name);
144 let new_st = rec.q_len - rec.q_en;
145 let new_en = rec.q_len - rec.q_st;
146 rec.q_st = new_st;
147 rec.q_en = new_en;
148 rec.strand = if rec.strand == '+' { '-' } else { '+' };
149 } else {
150 rec.q_name = format!("{}+", rec.q_name);
151 }
152 }
153 }
154
155 pub fn scaffold(&mut self, spacer_size: u64) {
157 self.records.sort_by(|a, b| {
159 a.t_name
160 .cmp(&b.t_name) .then(a.order.cmp(&b.order)) .then(a.q_st.cmp(&b.q_st)) });
164
165 for (_t_name, t_recs) in &self.records.iter_mut().group_by(|rec| rec.t_name.clone()) {
167 let mut t_recs: Vec<&mut PafRecord> = t_recs.collect();
168 t_recs.sort_by(|a, b| {
170 a.order
171 .cmp(&b.order) .then(a.q_st.cmp(&b.q_st)) });
174
175 let scaffold_name = t_recs
177 .iter()
178 .map(|rec| rec.q_name.clone())
179 .unique()
180 .collect::<Vec<String>>()
181 .join("::");
182
183 let mut scaffold_len = 0_u64;
184 for (_q_name, q_recs) in &t_recs.iter_mut().group_by(|rec| rec.q_name.clone()) {
185 let q_recs: Vec<&mut &mut PafRecord> = q_recs.collect();
186 let q_min = q_recs.iter().map(|rec| rec.q_st).min().unwrap_or(0);
187 let q_max = q_recs.iter().map(|rec| rec.q_en).max().unwrap_or(0);
188 let added_q_bases = q_max - q_min;
189 for rec in q_recs {
190 rec.q_st = rec.q_st - q_min + scaffold_len;
191 rec.q_en = rec.q_en - q_min + scaffold_len;
192 }
193 scaffold_len += added_q_bases + spacer_size;
194 }
195 scaffold_len -= spacer_size;
197
198 for rec in t_recs {
199 rec.q_name = scaffold_name.clone();
200 rec.q_len = scaffold_len;
201 }
202 }
203 }
204
205 pub fn overlapping_paf_recs(
207 &mut self,
208 match_score: i32,
209 diff_score: i32,
210 indel_score: i32,
211 remove_contained: bool,
212 ) {
213 for rec in &mut self.records {
215 rec.remove_trailing_indels();
216 }
217
218 let mut overlap_pairs = Vec::new();
219 self.records.sort_by_key(|rec| rec.q_name.clone());
220 let mut contained_indexes = vec![false; self.records.len()];
221
222 if self.records.len() < 2 {
224 return;
225 }
226
227 for i in 0..(self.records.len() - 1) {
228 let rec1 = &self.records[i];
229 let rgn1 = rec1.get_query_as_region();
230 let mut j = i + 1;
231 while j < self.records.len() && rec1.q_name == self.records[j].q_name {
232 let rec2 = &self.records[j];
233 let rgn2 = rec2.get_query_as_region();
234 let overlap = bed::get_overlap(&rgn1, &rgn2);
236 if overlap < 1 {
238 j += 1;
239 continue;
240 } else if overlap == (rec2.q_en - rec2.q_st) {
241 contained_indexes[j] = true;
242 log::debug!("{}\n^is contained in another alignment", rec2);
243 } else if overlap == (rec1.q_en - rec1.q_st) {
244 contained_indexes[i] = true;
245 log::debug!("{}\n^is contained in another alignment", rec1);
246 } else {
247 if rec1.q_st <= rec2.q_st {
249 overlap_pairs.push((overlap, i, j));
250 } else {
251 overlap_pairs.push((overlap, j, i));
252 }
253 }
254 j += 1;
256 }
257 }
258 overlap_pairs.sort_by_key(|rec| u64::MAX - rec.0);
259 log::debug!("{} overlapping pairs found", overlap_pairs.len());
260 let mut q_seen: HashSet<String> = HashSet::new();
261 let mut unseen = 0;
262 for (_overlap, i, j) in overlap_pairs {
263 let mut left = self.records[i].clone();
264 let mut right = self.records[j].clone();
265 let q_name = left.q_name.clone();
266 if !q_seen.contains(&q_name) {
269 trim_overlapping_pafs(&mut left, &mut right, match_score, diff_score, indel_score);
270 log::trace!("{}", left);
271 log::trace!("{}", right);
272 self.records[i] = left;
273 self.records[j] = right;
274 q_seen.insert(q_name);
275 } else {
276 unseen += 1;
277 }
278 }
279
280 if unseen > 0 {
281 self.overlapping_paf_recs(match_score, diff_score, indel_score, remove_contained);
283 } else if remove_contained {
284 let n_to_remove = contained_indexes.iter().filter(|&x| *x).count();
285 log::info!("Removing {} contained alignments.", n_to_remove);
286 log::info!("{} total alignments.", self.records.len());
287 let mut new_records = Vec::new();
288 assert!(self.records.len() == contained_indexes.len());
289 for (i, rec) in self.records.iter().enumerate() {
290 if !contained_indexes[i] {
291 new_records.push(rec.clone());
292 }
293 }
294 self.records = new_records;
295 log::info!("{} total alignments.", self.records.len());
296 }
299 }
300
301 pub fn sam_header(&self) -> String {
313 let mut header = "@HD\tVN:1.6\n".to_string();
322
323 let mut names: Vec<(String, u64)> = self
325 .records
326 .iter()
327 .map(|rec| (rec.t_name.clone(), rec.t_len))
328 .unique()
329 .collect();
330
331 names.sort_by(|a, b| natord::compare(&a.0, &b.0));
332 for (name, length) in names {
333 header.push_str(&format!("@SQ\tSN:{}\tLN:{}\n", name, length));
334 }
335 header.push_str("@PG\tID:rustybam\tPN:rustybam");
336 header
337 }
338}
339
340#[derive(Debug, Clone, Copy)]
344pub enum CsOp {
345 Matches(u32),
347 MatchSeq(u32, u32),
349 Mismatch(u8, u8),
351 Insertion(u32, u32),
353 Deletion(u32, u32),
355}
356
357impl CsOp {
358 pub fn trim(&self, skip: u32, keep: u32) -> CsOp {
361 match self {
362 CsOp::Matches(_) => CsOp::Matches(keep),
363 CsOp::MatchSeq(off, _) => CsOp::MatchSeq(off + skip, keep),
364 CsOp::Mismatch(r, q) => {
365 debug_assert!(skip == 0 && keep == 1);
366 CsOp::Mismatch(*r, *q)
367 }
368 CsOp::Insertion(off, _) => CsOp::Insertion(off + skip, keep),
369 CsOp::Deletion(off, _) => CsOp::Deletion(off + skip, keep),
370 }
371 }
372}
373
374#[derive(Debug, Clone)]
377pub struct CsOps {
378 pub ops: Vec<CsOp>,
379 pub seq_data: Vec<u8>,
380}
381
382impl CsOps {
383 #[inline]
385 pub fn seq(&self, offset: u32, len: u32) -> &[u8] {
386 &self.seq_data[offset as usize..(offset + len) as usize]
387 }
388
389 #[inline]
391 pub fn op_seq(&self, op: &CsOp) -> Option<&[u8]> {
392 match op {
393 CsOp::MatchSeq(off, len)
394 | CsOp::Insertion(off, len)
395 | CsOp::Deletion(off, len) => {
396 Some(&self.seq_data[*off as usize..(*off + *len) as usize])
397 }
398 _ => None,
399 }
400 }
401
402 pub fn to_cs_string(&self) -> String {
404 let mut buf = String::new();
405 for op in &self.ops {
406 match op {
407 CsOp::Matches(n) => {
408 buf.push(':');
409 let mut tmp = itoa::Buffer::new();
410 buf.push_str(tmp.format(*n));
411 }
412 CsOp::MatchSeq(off, len) => {
413 buf.push('=');
414 buf.push_str(std::str::from_utf8(self.seq(*off, *len)).unwrap());
415 }
416 CsOp::Mismatch(r, q) => {
417 buf.push('*');
418 buf.push(*r as char);
419 buf.push(*q as char);
420 }
421 CsOp::Insertion(off, len) => {
422 buf.push('+');
423 buf.push_str(std::str::from_utf8(self.seq(*off, *len)).unwrap());
424 }
425 CsOp::Deletion(off, len) => {
426 buf.push('-');
427 buf.push_str(std::str::from_utf8(self.seq(*off, *len)).unwrap());
428 }
429 }
430 }
431 buf
432 }
433}
434
435impl fmt::Display for CsOps {
436 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
437 f.write_str(&self.to_cs_string())
438 }
439}
440
441#[derive(Debug, Clone)]
442pub struct PafRecord {
443 pub q_name: String,
444 pub q_len: u64,
445 pub q_st: u64,
446 pub q_en: u64,
447 pub strand: char,
448 pub t_name: String,
449 pub t_len: u64,
450 pub t_st: u64,
451 pub t_en: u64,
452 pub nmatch: u64,
453 pub aln_len: u64,
454 pub mapq: u64,
455 pub cigar: CigarString,
456 pub cs_ops: Option<CsOps>,
460 pub tags: String,
461 pub id: String,
462 pub order: u64,
463 pub contained: bool,
464}
465
466impl PafRecord {
467 pub fn new(line: &str) -> PafResult<PafRecord> {
476 let t: Vec<&str> = line.split_ascii_whitespace().collect();
477 assert!(t.len() >= 12); let mut tags = "".to_string();
482 let mut cs_idx: Option<usize> = None;
483 let mut cg_idx: Option<usize> = None;
484 for (i, token) in t.iter().enumerate().skip(12) {
485 debug_assert!(
486 token.len() >= 5
487 && token.as_bytes()[2] == b':'
488 && token.as_bytes()[4] == b':',
489 "Malformed PAF tag: {}",
490 token
491 );
492 let tag = &token[..2];
493 if tag == "cs" {
494 cs_idx = Some(i);
495 } else if tag == "cg" {
496 cg_idx = Some(i);
497 } else {
498 tags.push('\t');
499 tags.push_str(token);
500 }
501 }
502
503 let mut cigar = CigarString(vec![]);
506 let mut cs_ops: Option<CsOps> = None;
507 if let Some(idx) = cs_idx {
508 let value = &t[idx][5..];
509 let (parsed_cigar, parsed_ops) = parse_cs_string(value)?;
510 cigar = parsed_cigar;
511 cs_ops = Some(parsed_ops);
512 } else if let Some(idx) = cg_idx {
513 let value = &t[idx][5..];
514 log::trace!("parsing cigar of length: {}", value.len());
515 cigar =
516 CigarString::try_from(value.as_bytes()).expect("Unable to parse cigar string.");
517 }
518
519 let rec = PafRecord {
521 q_name: t[0].to_string(),
522 q_len: t[1].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
523 q_st: t[2].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
524 q_en: t[3].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
525 strand: t[4].parse::<char>().map_err(|_| Error::ParsePafColumn {})?,
526 t_name: t[5].to_string(),
527 t_len: t[6].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
528 t_st: t[7].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
529 t_en: t[8].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
530 nmatch: t[9].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
531 aln_len: t[10].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
532 mapq: t[11].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
533 cigar,
534 cs_ops,
535 tags,
536 id: "".to_string(),
537 order: 0,
538 contained: false,
539 };
540 Ok(rec)
541 }
542
543 #[must_use]
544 pub fn small_copy(&self) -> PafRecord {
545 PafRecord {
546 q_name: self.q_name.clone(),
547 q_len: self.q_len,
548 q_st: self.q_st,
549 q_en: self.q_en,
550 strand: self.strand,
551 t_name: self.t_name.clone(),
552 t_len: self.t_len,
553 t_st: self.t_st,
554 t_en: self.t_en,
555 nmatch: self.nmatch,
556 aln_len: self.aln_len,
557 mapq: self.mapq,
558 cigar: CigarString(Vec::new()),
559 cs_ops: None,
560 tags: self.tags.clone(),
561 id: self.id.clone(),
562 order: self.order,
563 contained: self.contained,
564 }
565 }
566
567 pub fn get_query_as_region(&self) -> bed::Region {
569 bed::Region {
570 name: self.q_name.clone(),
571 st: self.q_st,
572 en: self.q_en,
573 ..Default::default()
574 }
575 }
576
577 pub fn get_target_as_region(&self) -> bed::Region {
589 bed::Region {
590 name: self.t_name.clone(),
591 st: self.t_st,
592 en: self.t_en,
593 ..Default::default()
594 }
595 }
596
597 pub fn collapse_long_cigar(cigar: &CigarString) -> CigarString {
598 let mut rtn = Vec::new();
599 let mut pre_opt = cigar.0[0];
600 let mut pre_len = 1;
601 let mut idx = 1;
602 while idx < cigar.len() {
603 let cur_opt = cigar.0[idx];
604 if std::mem::discriminant(&cur_opt) == std::mem::discriminant(&pre_opt) {
605 pre_len += 1;
606 } else {
607 rtn.push(update_cigar_opt_len(&pre_opt, pre_len));
608 pre_opt = cur_opt;
609 pre_len = 1;
610 }
611 idx += 1;
612 }
613 rtn.push(update_cigar_opt_len(&pre_opt, pre_len));
614 CigarString(rtn)
615 }
616
617 pub fn paf_overlaps_rgn(&self, rgn: &bed::Region) -> bool {
618 if self.t_name != rgn.name {
619 return false;
620 }
621 self.t_en > rgn.st && self.t_st < rgn.en
622 }
623
624 pub fn infer_n_bases(&mut self) -> (u64, u64, u64, u64) {
627 let mut t_bases = 0;
628 let mut q_bases = 0;
629 let mut n_matches = 0;
630 let mut aln_len = 0;
631 for opt in self.cigar.into_iter() {
632 if consumes_reference(opt) {
633 t_bases += opt.len()
634 }
635 if consumes_query(opt) {
636 q_bases += opt.len()
637 }
638 if is_match(opt) {
639 n_matches += opt.len()
640 }
641 aln_len += opt.len();
642 }
643 (
644 t_bases as u64,
645 q_bases as u64,
646 n_matches as u64,
647 aln_len as u64,
648 )
649 }
650
651 pub fn remove_trailing_indels(&mut self) {
652 let cigar_len = self.cigar.len();
656
657 let mut st_opt = *self.cigar.first().unwrap();
659 let mut remove_st_t = 0;
660 let mut remove_st_q = 0;
661 let mut remove_st_opts = 0;
662 let mut removed_st_opts = Vec::new();
663 while matches!(st_opt, Ins(_) | Del(_)) {
664 if matches!(st_opt, Del(_)) {
665 remove_st_t += st_opt.len();
667 remove_st_q += 1;
669 } else {
670 remove_st_q += st_opt.len();
671 }
675 remove_st_opts += 1;
676 removed_st_opts.push(st_opt);
677 if remove_st_opts < cigar_len {
678 st_opt = self.cigar[remove_st_opts];
679 } else {
680 break;
681 }
682 }
683
684 if removed_st_opts.len() > 1 {
686 for i in 0..(removed_st_opts.len() - 1) {
687 let pre_opt = removed_st_opts[i];
688 let cur_opt = removed_st_opts[i + 1];
689 if (matches!(pre_opt, Del(_)) && matches!(cur_opt, Ins(_)))
690 || (matches!(pre_opt, Ins(_)) && matches!(cur_opt, Del(_)))
691 {
692 remove_st_t += 1;
693 remove_st_q -= 1;
694 }
695 }
696 }
697
698 let mut en_opt = *self.cigar.last().unwrap();
700 let mut remove_en_t = 0;
701 let mut remove_en_q = 0;
702 let mut remove_en_opts = 0;
703 let mut removed_en_opts = Vec::new();
704 while matches!(en_opt, Ins(_) | Del(_)) {
705 if matches!(en_opt, Del(_)) {
706 remove_en_t += en_opt.len();
708 } else {
709 remove_en_q += en_opt.len();
710 }
711 remove_en_opts += 1;
712 removed_en_opts.push(en_opt);
713 if cigar_len - remove_en_opts > 0 {
714 en_opt = self.cigar[cigar_len - 1 - remove_en_opts];
715 } else {
716 break;
717 }
718 }
719
720 if remove_en_opts > 0 || remove_st_opts > 0 {
722 self.id += &format!(
723 "_TO.{}.{}",
724 CigarString(removed_st_opts.clone()),
725 CigarString(removed_en_opts.clone())
726 );
727 }
728
729 if remove_en_opts > 0 || remove_st_opts > 0 {
731 log::debug!(
732 "\nRemoved {} leading and {} trailing indels:\n{}\n{}\ntarget changes:{},{}\nquery changes: {},{}\n{}:{}-{}\n{}:{}-{}",
733 remove_st_opts,
734 remove_en_opts,
735 CigarString(removed_st_opts),
736 CigarString(removed_en_opts),
737 remove_st_t,
738 remove_en_t,
739 remove_st_q,
740 remove_en_q,
741 self.q_name,
742 self.q_st,
743 self.q_en,
744 self.t_name,
745 self.t_st,
746 self.t_en,
747 );
748 }
749
750 self.cigar = CigarString(self.cigar.0[remove_st_opts..].to_vec());
752 self.cigar.0.truncate(self.cigar.len() - remove_en_opts);
753 if let Some(ref mut cs) = self.cs_ops {
754 cs.ops = cs.ops[remove_st_opts..].to_vec();
755 cs.ops.truncate(cs.ops.len() - remove_en_opts);
756 }
758
759 self.t_st += remove_st_t as u64;
761 self.t_en -= remove_en_t as u64;
762
763 if self.strand == '-' {
765 std::mem::swap(&mut remove_st_q, &mut remove_en_q);
766 }
767 self.q_st += remove_st_q as u64;
769 self.q_en -= remove_en_q as u64;
770
771 if !self.cigar.is_empty() {
773 let st_opt = *self.cigar.first().unwrap();
774 let en_opt = *self.cigar.last().unwrap();
775 if matches!(st_opt, Ins(_) | Del(_)) || matches!(en_opt, Ins(_) | Del(_)) {
776 eprintln!("Why are there still indels?\n{}", self);
777 }
779 }
780
781 self.check_integrity().unwrap();
783 }
784
785 pub fn truncate_record_by_query(&mut self, new_q_st: u64, new_q_en: u64) {
788 assert!(new_q_st >= self.q_st, "New start is less than old start.");
790 assert!(new_q_en <= self.q_en, "New end is greater than old end.");
791
792 if new_q_st == self.q_st && new_q_en == self.q_en {
793 return;
794 }
795
796 let cs_data = self.cs_ops.take(); let mut new_cs_ops: Option<CsOps> = cs_data.as_ref().map(|cd| CsOps {
798 ops: Vec::new(),
799 seq_data: cd.seq_data.clone(), });
801
802 let mut q_pos = if self.strand == '+' { self.q_st } else { self.q_en };
803 let mut new_cigar_ops: Vec<Cigar> = Vec::new();
804 let mut t_before: u64 = 0;
805 let mut q_consumed_before: u64 = 0;
806 let mut q_consumed_in: u64 = 0;
807 let mut started = false;
808 let mut finished = false;
809
810 for (ci, op) in self.cigar.into_iter().enumerate() {
811 if finished {
812 break;
813 }
814 let op_len = op.len() as u64;
815 let moves_t = consumes_reference(op);
816 let moves_q = consumes_query(op);
817
818 if !moves_q {
819 if started {
821 new_cigar_ops.push(*op);
822 if let (Some(ref mut ncs), Some(ref cs)) = (&mut new_cs_ops, &cs_data) {
823 ncs.ops.push(cs.ops[ci]);
824 }
825 } else if moves_t {
826 t_before += op_len;
827 }
828 continue;
829 }
830
831 let (q_lo, q_hi) = if self.strand == '+' {
833 let lo = q_pos;
834 q_pos += op_len;
835 (lo, q_pos)
836 } else {
837 q_pos -= op_len;
838 (q_pos, q_pos + op_len)
839 };
840
841 let overlap_start = std::cmp::max(q_lo, new_q_st);
843 let overlap_end = std::cmp::min(q_hi, new_q_en);
844 if overlap_start >= overlap_end {
845 if !started {
847 if moves_t {
848 t_before += op_len;
849 }
850 q_consumed_before += op_len;
851 }
852 continue;
853 }
854
855 let skip_before_cigar = if self.strand == '+' {
857 overlap_start - q_lo
858 } else {
859 q_hi - overlap_end
860 };
861 let keep = overlap_end - overlap_start;
862
863 if !started {
864 started = true;
865 if moves_t {
866 t_before += skip_before_cigar;
867 }
868 q_consumed_before += skip_before_cigar;
869 }
870
871 q_consumed_in += keep;
872 new_cigar_ops.push(update_cigar_opt_len(op, keep as u32));
873 if let (Some(ref mut ncs), Some(ref cs)) = (&mut new_cs_ops, &cs_data) {
874 ncs.ops.push(cs.ops[ci].trim(skip_before_cigar as u32, keep as u32));
875 }
876
877 if (self.strand == '+' && overlap_end >= new_q_en)
879 || (self.strand == '-' && overlap_start <= new_q_st)
880 {
881 finished = true;
882 }
883 }
884
885 if new_cigar_ops.is_empty() {
886 self.cs_ops = new_cs_ops;
887 return;
888 }
889
890 let mut lead_t: u64 = 0;
892 let mut lead_q: u64 = 0;
893 while !new_cigar_ops.is_empty() {
894 let first = new_cigar_ops[0];
895 if matches!(first, Match(_) | Equal(_) | Diff(_)) {
896 break;
897 }
898 if consumes_reference(&first) {
899 lead_t += first.len() as u64;
900 }
901 if consumes_query(&first) {
902 lead_q += first.len() as u64;
903 }
904 new_cigar_ops.remove(0);
905 if let Some(ref mut ncs) = new_cs_ops {
906 ncs.ops.remove(0);
907 }
908 }
909 t_before += lead_t;
910 q_consumed_before += lead_q;
911 q_consumed_in -= lead_q;
912
913 let mut trail_q: u64 = 0;
915 while !new_cigar_ops.is_empty() {
916 let last = *new_cigar_ops.last().unwrap();
917 if matches!(last, Match(_) | Equal(_) | Diff(_)) {
918 break;
919 }
920 if consumes_query(&last) {
921 trail_q += last.len() as u64;
922 }
923 new_cigar_ops.pop();
924 if let Some(ref mut ncs) = new_cs_ops {
925 ncs.ops.pop();
926 }
927 }
928 q_consumed_in -= trail_q;
929
930 if new_cigar_ops.is_empty() {
931 self.cs_ops = new_cs_ops;
932 return;
933 }
934
935 self.t_st += t_before;
937 let t_bases: u64 = new_cigar_ops
938 .iter()
939 .filter(|op| consumes_reference(op))
940 .map(|op| op.len() as u64)
941 .sum();
942 self.t_en = self.t_st + t_bases;
943
944 if self.strand == '+' {
946 self.q_st += q_consumed_before;
947 self.q_en = self.q_st + q_consumed_in;
948 } else {
949 self.q_en -= q_consumed_before;
950 self.q_st = self.q_en - q_consumed_in;
951 }
952
953 self.cigar = CigarString(new_cigar_ops);
954 self.cs_ops = new_cs_ops;
955
956 self.check_integrity().unwrap();
958 }
959
960 pub fn check_integrity(&mut self) -> PafResult<()> {
961 let (t_bases, q_bases, nmatch, aln_len) = self.infer_n_bases();
962 if self.t_en - self.t_st != t_bases {
963 return Err(Error::PafParseCigar {
964 msg: format!(
965 "target bases {} from cigar does not equal {}-{}={}\n{}\n",
966 t_bases,
967 self.t_en,
968 self.t_st,
969 self.t_en - self.t_st,
970 self
971 ),
972 });
973 }
974 if self.q_en - self.q_st != q_bases {
975 return Err(Error::PafParseCigar {
976 msg: format!(
977 "query bases {} from cigar does not equal {}-{}={}\n{}\n",
978 q_bases,
979 self.q_en,
980 self.q_st,
981 self.q_en - self.q_st,
982 self
983 ),
984 });
985 }
986
987 self.nmatch = nmatch;
989 self.aln_len = aln_len;
990
991 Ok(())
992 }
993
994 pub fn to_sam_string(&self, reader: Option<&faidx::Reader>) -> String {
1003 let mut clip_char = 'H';
1007 let seq = match reader {
1008 Some(reader) => {
1009 let seq = getfasta::fetch_fasta(
1010 reader,
1011 &self.q_name,
1012 0, self.q_len as usize,
1014 );
1015 clip_char = 'S';
1016 let seq = if self.strand == '-' {
1017 revcomp(seq)
1018 } else {
1019 seq
1020 };
1021 std::str::from_utf8(&seq).unwrap().to_string()
1022 }
1023 None => "*".to_string(),
1024 };
1025 let qual = "*".to_string();
1026 let flag = if self.strand == '-' { 16 } else { 0 };
1027 let mut leading_clip = if self.q_st > 0 {
1028 format!("{}{}", self.q_st, clip_char)
1029 } else {
1030 "".to_string()
1031 };
1032 let mut trailing_clip = if self.q_len - self.q_en > 0 {
1033 format!("{}{}", self.q_len - self.q_en, clip_char)
1034 } else {
1035 "".to_string()
1036 };
1037 if self.strand == '-' {
1038 std::mem::swap(&mut leading_clip, &mut trailing_clip);
1039 }
1040 let o_cigar = format!("{}{}{}", leading_clip, self.cigar, trailing_clip);
1041 format!(
1042 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
1043 self.q_name,
1044 flag,
1045 self.t_name,
1046 self.t_st + 1,
1047 self.mapq,
1048 o_cigar,
1049 "*",
1050 0,
1051 0,
1052 seq,
1053 qual
1054 )
1055 }
1056}
1057
1058impl PafRecord {
1059 pub fn write_to_buf(&self, buf: &mut String) {
1066 #[inline(always)]
1068 fn push_u64(buf: &mut String, v: u64) {
1069 let mut tmp = itoa::Buffer::new();
1070 buf.push_str(tmp.format(v));
1071 }
1072 #[inline(always)]
1073 fn push_u32(buf: &mut String, v: u32) {
1074 let mut tmp = itoa::Buffer::new();
1075 buf.push_str(tmp.format(v));
1076 }
1077
1078 buf.push_str(&self.q_name);
1079 buf.push('\t');
1080 push_u64(buf, self.q_len);
1081 buf.push('\t');
1082 push_u64(buf, self.q_st);
1083 buf.push('\t');
1084 push_u64(buf, self.q_en);
1085 buf.push('\t');
1086 buf.push(self.strand);
1087 buf.push('\t');
1088 buf.push_str(&self.t_name);
1089 buf.push('\t');
1090 push_u64(buf, self.t_len);
1091 buf.push('\t');
1092 push_u64(buf, self.t_st);
1093 buf.push('\t');
1094 push_u64(buf, self.t_en);
1095 buf.push('\t');
1096 push_u64(buf, self.nmatch);
1097 buf.push('\t');
1098 push_u64(buf, self.aln_len);
1099 buf.push('\t');
1100 push_u64(buf, self.mapq);
1101 buf.push_str("\tid:Z:");
1102 buf.push_str(&self.id);
1103 buf.push_str("\tcg:Z:");
1104
1105 for op in self.cigar.iter() {
1106 push_u32(buf, op.len());
1107 buf.push(match op {
1108 Match(_) => 'M',
1109 Ins(_) => 'I',
1110 Del(_) => 'D',
1111 RefSkip(_) => 'N',
1112 SoftClip(_) => 'S',
1113 HardClip(_) => 'H',
1114 Pad(_) => 'P',
1115 Equal(_) => '=',
1116 Diff(_) => 'X',
1117 });
1118 }
1119
1120 if let Some(ref cs) = self.cs_ops {
1121 buf.push_str("\tcs:Z:");
1122 for op in &cs.ops {
1123 match op {
1124 CsOp::Matches(n) => {
1125 buf.push(':');
1126 push_u32(buf, *n);
1127 }
1128 CsOp::MatchSeq(off, len) => {
1129 buf.push('=');
1130 buf.push_str(std::str::from_utf8(cs.seq(*off, *len)).unwrap());
1131 }
1132 CsOp::Mismatch(r, q) => {
1133 buf.push('*');
1134 buf.push(*r as char);
1135 buf.push(*q as char);
1136 }
1137 CsOp::Insertion(off, len) => {
1138 buf.push('+');
1139 buf.push_str(std::str::from_utf8(cs.seq(*off, *len)).unwrap());
1140 }
1141 CsOp::Deletion(off, len) => {
1142 buf.push('-');
1143 buf.push_str(std::str::from_utf8(cs.seq(*off, *len)).unwrap());
1144 }
1145 }
1146 }
1147 }
1148 }
1149}
1150
1151impl fmt::Display for PafRecord {
1152 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1153 let mut buf = String::with_capacity(256);
1154 self.write_to_buf(&mut buf);
1155 f.write_str(&buf)
1156 }
1157}
1158
1159pub struct PafWriter {
1162 out: std::io::BufWriter<std::io::StdoutLock<'static>>,
1163 buf: String,
1164}
1165
1166impl PafWriter {
1167 pub fn new() -> Self {
1168 PafWriter {
1169 out: std::io::BufWriter::with_capacity(
1170 64 * 1024,
1171 std::io::stdout().lock(),
1172 ),
1173 buf: String::with_capacity(512 * 1024),
1174 }
1175 }
1176
1177 pub fn write_rec(&mut self, rec: &PafRecord) {
1178 use std::io::Write;
1179 self.buf.clear();
1180 rec.write_to_buf(&mut self.buf);
1181 self.out.write_all(self.buf.as_bytes()).unwrap();
1182 self.out.write_all(b"\n").unwrap();
1183 }
1184}
1185
1186pub fn consumes_reference(cigar_opt: &Cigar) -> bool {
1187 matches!(
1188 cigar_opt,
1189 Match(_i) | Del(_i) | RefSkip(_i) | Diff(_i) | Equal(_i)
1190 )
1191}
1192pub fn consumes_query(cigar_opt: &Cigar) -> bool {
1199 matches!(
1200 cigar_opt,
1201 Match(_i) | Ins(_i) | SoftClip(_i) | Diff(_i) | Equal(_i)
1202 )
1203}
1204
1205pub fn is_match(cigar_opt: &Cigar) -> bool {
1214 matches!(cigar_opt, Match(_i) | Diff(_i) | Equal(_i))
1215}
1216
1217pub fn update_cigar_opt_len(opt: &Cigar, new_opt_len: u32) -> Cigar {
1225 match opt {
1226 Match(_) => Match(new_opt_len),
1227 Ins(_) => Ins(new_opt_len),
1228 Del(_) => Del(new_opt_len),
1229 RefSkip(_) => RefSkip(new_opt_len),
1230 HardClip(_) => HardClip(new_opt_len),
1231 SoftClip(_) => SoftClip(new_opt_len),
1232 Pad(_) => Pad(new_opt_len),
1233 Equal(_) => Equal(new_opt_len),
1234 Diff(_) => Diff(new_opt_len),
1235 }
1236}
1237
1238pub fn cigar_from_str(text: &str) -> PafResult<CigarString> {
1255 let bytes = text.as_bytes();
1256 let mut inner = Vec::new();
1257 let mut i = 0;
1258 let text_len = text.len();
1259 while i < text_len {
1260 let mut j = i;
1261 while j < text_len && bytes[j].is_ascii_digit() {
1262 j += 1;
1263 }
1264 let n = u32::from_str(&text[i..j]).map_err(|_| Error::PafParseCigar {
1265 msg: "expected integer".to_owned(),
1266 })?;
1267 let op = &text[j..j + 1];
1268 inner.push(match op {
1269 "M" => Cigar::Match(n),
1270 "I" => Cigar::Ins(n),
1271 "D" => Cigar::Del(n),
1272 "N" => Cigar::RefSkip(n),
1273 "H" => Cigar::HardClip(n),
1274 "S" => Cigar::SoftClip(n),
1275 "P" => Cigar::Pad(n),
1276 "=" => Cigar::Equal(n),
1277 "X" => Cigar::Diff(n),
1278 op => {
1279 return Err(Error::PafParseCigar {
1280 msg: format!("Cannot parse opt: {}", op),
1281 })
1282 }
1283 });
1284 i = j + 1;
1285 }
1286 Ok(CigarString(inner))
1287}
1288
1289pub fn cigar_swap_target_query(cigar: &CigarString, strand: char) -> CigarString {
1291 let mut new_cigar = Vec::new();
1293 for opt in cigar.into_iter() {
1294 let new_opt = match opt {
1295 Ins(l) => Del(*l),
1296 Del(l) => Ins(*l),
1297 _ => *opt,
1298 };
1299 new_cigar.push(new_opt);
1300 }
1301 if strand == '-' {
1302 new_cigar.reverse();
1303 }
1304 CigarString(new_cigar)
1305}
1306
1307pub fn paf_swap_query_and_target(paf: &PafRecord) -> PafRecord {
1309 let mut flipped = paf.clone();
1310 flipped.t_name = paf.q_name.clone();
1312 flipped.t_len = paf.q_len;
1313 flipped.t_st = paf.q_st;
1314 flipped.t_en = paf.q_en;
1315 flipped.q_name = paf.t_name.clone();
1317 flipped.q_len = paf.t_len;
1318 flipped.q_st = paf.t_st;
1319 flipped.q_en = paf.t_en;
1320
1321 flipped.cigar = cigar_swap_target_query(&paf.cigar, paf.strand);
1323
1324 flipped.cs_ops = paf.cs_ops.as_ref().map(|cs| {
1326 let mut new_ops: Vec<CsOp> = cs
1327 .ops
1328 .iter()
1329 .map(|op| match op {
1330 CsOp::Insertion(off, len) => CsOp::Deletion(*off, *len),
1331 CsOp::Deletion(off, len) => CsOp::Insertion(*off, *len),
1332 other => *other,
1333 })
1334 .collect();
1335 if paf.strand == '-' {
1336 new_ops.reverse();
1337 }
1338 CsOps {
1339 ops: new_ops,
1340 seq_data: cs.seq_data.clone(),
1341 }
1342 });
1343
1344 flipped
1345}
1346
1347pub fn make_fake_paf_rec() -> PafRecord {
1348 PafRecord::new("Q 10 2 10 - T 20 12 20 3 9 60 cg:Z:4M1I1D3=").unwrap()
1349}
1350
1351pub fn parse_cs_string(cs: &str) -> PafResult<(CigarString, CsOps)> {
1371 let bytes = cs.as_bytes();
1372 let length = bytes.len();
1373 let mut i = 0;
1374 let estimated_ops = length / 2;
1376 let mut cigar = Vec::with_capacity(estimated_ops);
1377 let mut ops = Vec::with_capacity(estimated_ops);
1378 let mut seq_data: Vec<u8> = Vec::with_capacity(length);
1380 while i < length {
1381 let cs_opt = bytes[i];
1382 i += 1; match cs_opt {
1384 b'=' => {
1385 let start = i;
1386 while i < length && matches!(bytes[i], b'A' | b'C' | b'G' | b'T' | b'N') {
1387 i += 1;
1388 }
1389 let l = (i - start) as u32;
1390 let offset = seq_data.len() as u32;
1391 seq_data.extend_from_slice(&bytes[start..i]);
1392 cigar.push(Cigar::Equal(l));
1393 ops.push(CsOp::MatchSeq(offset, l));
1394 }
1395 b':' => {
1396 let start = i;
1397 while i < length && bytes[i].is_ascii_digit() {
1398 i += 1;
1399 }
1400 let l = u32::from_str(&cs[start..i]).map_err(|_| Error::ParseIntError {
1401 msg: "Expected integer in cs :N operator".to_string(),
1402 })?;
1403 cigar.push(Cigar::Equal(l));
1404 ops.push(CsOp::Matches(l));
1405 }
1406 b'*' => {
1407 let ref_base = bytes[i];
1408 let query_base = bytes[i + 1];
1409 i += 2;
1410 cigar.push(Cigar::Diff(1));
1411 ops.push(CsOp::Mismatch(ref_base, query_base));
1412 }
1413 b'+' => {
1414 let start = i;
1415 while i < length && bytes[i].is_ascii_lowercase() {
1416 i += 1;
1417 }
1418 let l = (i - start) as u32;
1419 let offset = seq_data.len() as u32;
1420 seq_data.extend_from_slice(&bytes[start..i]);
1421 cigar.push(Cigar::Ins(l));
1422 ops.push(CsOp::Insertion(offset, l));
1423 }
1424 b'-' => {
1425 let start = i;
1426 while i < length && bytes[i].is_ascii_lowercase() {
1427 i += 1;
1428 }
1429 let l = (i - start) as u32;
1430 let offset = seq_data.len() as u32;
1431 seq_data.extend_from_slice(&bytes[start..i]);
1432 cigar.push(Cigar::Del(l));
1433 ops.push(CsOp::Deletion(offset, l));
1434 }
1435 b'~' => {
1436 return Err(Error::PafParseCS {
1437 msg: "Splice operations not yet supported.".to_string(),
1438 });
1439 }
1440 _ => {
1441 return Err(Error::PafParseCS {
1442 msg: format!("Unexpected operator in the cs string: {}", cs_opt as char),
1443 });
1444 }
1445 }
1446 }
1447 Ok((CigarString(cigar), CsOps { ops, seq_data }))
1448}
1449
1450pub fn cs_to_cigar(cs: &str) -> PafResult<CigarString> {
1465 cigar_from_cs(cs)
1466}
1467
1468pub fn cigar_from_cs(cs: &str) -> PafResult<CigarString> {
1485 let bytes = cs.as_bytes();
1486 let length = bytes.len();
1487 let mut i = 0;
1488 let estimated_ops = length / 2;
1489 let mut cigar = Vec::with_capacity(estimated_ops);
1490 while i < length {
1491 let cs_opt = bytes[i];
1492 i += 1;
1493 match cs_opt {
1494 b'=' => {
1495 let start = i;
1496 while i < length && matches!(bytes[i], b'A' | b'C' | b'G' | b'T' | b'N') {
1497 i += 1;
1498 }
1499 cigar.push(Cigar::Equal((i - start) as u32));
1500 }
1501 b':' => {
1502 let start = i;
1503 while i < length && bytes[i].is_ascii_digit() {
1504 i += 1;
1505 }
1506 let l = u32::from_str(&cs[start..i]).map_err(|_| Error::ParseIntError {
1507 msg: "Expected integer in cs :N operator".to_string(),
1508 })?;
1509 cigar.push(Cigar::Equal(l));
1510 }
1511 b'*' => {
1512 i += 2;
1513 cigar.push(Cigar::Diff(1));
1514 }
1515 b'+' => {
1516 let start = i;
1517 while i < length && bytes[i].is_ascii_lowercase() {
1518 i += 1;
1519 }
1520 cigar.push(Cigar::Ins((i - start) as u32));
1521 }
1522 b'-' => {
1523 let start = i;
1524 while i < length && bytes[i].is_ascii_lowercase() {
1525 i += 1;
1526 }
1527 cigar.push(Cigar::Del((i - start) as u32));
1528 }
1529 b'~' => {
1530 return Err(Error::PafParseCS {
1531 msg: "Splice operations not yet supported.".to_string(),
1532 });
1533 }
1534 _ => {
1535 return Err(Error::PafParseCS {
1536 msg: format!("Unexpected operator in the cs string: {}", cs_opt as char),
1537 });
1538 }
1539 }
1540 }
1541 Ok(CigarString(cigar))
1542}