Skip to main content

rustybam/
paf.rs

1use super::bed;
2use super::getfasta;
3use super::myio;
4use super::trim_overlap::trim_overlapping_pafs;
5use bio::alphabets::dna::revcomp;
6use core::fmt;
7use itertools::Itertools;
8use natord;
9use rust_htslib::bam::record::Cigar::*;
10use rust_htslib::bam::record::CigarString;
11use rust_htslib::bam::record::*;
12use rust_htslib::faidx;
13use std::collections::{HashMap, HashSet};
14use std::convert::TryFrom;
15use std::io::BufRead;
16use std::str::FromStr;
17
18// PAF_TAG regex removed — tags are parsed via direct string slicing (XX:Y:VALUE format)
19
20#[derive(Debug)]
21pub enum Error {
22    PafParseCigar { msg: String },
23    PafParseCS { msg: String },
24    ParseIntError { msg: String },
25    ParsePafColumn {},
26}
27type PafResult<T> = Result<T, crate::paf::Error>;
28
29#[derive(Debug)]
30pub struct Paf {
31    pub records: Vec<PafRecord>,
32    //pub records_by_contig: HashMap<String, Vec<&'a PafRecord>>,
33}
34
35impl Default for Paf {
36    fn default() -> Self {
37        Self::new()
38    }
39}
40
41impl Paf {
42    pub fn new() -> Paf {
43        Paf {
44            records: Vec::new(),
45            //records_by_contig: HashMap::new(),
46        }
47    }
48    /// read in the paf from a file pass "-" for stdin
49    /// # Example
50    /// ```
51    /// use rustybam::paf;
52    /// use std::fs::File;
53    /// use std::io::*;
54    /// let mut paf = paf::Paf::from_file(".test/asm_small.paf");
55    /// assert_eq!(paf.records.len(), 249);
56    ///
57    /// ```
58    pub fn from_file(file_name: &str) -> Paf {
59        let paf_file = myio::reader(file_name);
60        let mut paf = Paf::new();
61        // read the paf recs into a vector
62        for (index, line) in paf_file.lines().enumerate() {
63            log::trace!("{:?}", line);
64            match PafRecord::new(&line.unwrap()) {
65                Ok(mut rec) => {
66                    rec.check_integrity().unwrap();
67                    paf.records.push(rec);
68                }
69                Err(_) => eprintln!("\nUnable to parse PAF record. Skipping line {}", index + 1),
70            }
71            log::debug!("Read PAF record number: {}", index + 1);
72        }
73        paf
74    }
75
76    pub fn query_name_map(&mut self) -> HashMap<String, Vec<&PafRecord>> {
77        let mut records_by_contig = HashMap::new();
78        for rec in &self.records {
79            (*records_by_contig
80                .entry(rec.q_name.clone())
81                .or_insert_with(Vec::new))
82            .push(rec);
83        }
84        records_by_contig
85    }
86
87    pub fn filter_aln_pairs(&mut self, paired_len: u64) {
88        let mut dict = HashMap::new();
89        for rec in self.records.iter_mut() {
90            let aln_bp = dict
91                .entry((rec.t_name.clone(), rec.q_name.clone()))
92                .or_insert(0_u64);
93            *aln_bp += rec.t_en - rec.t_st;
94        }
95        self.records.retain(|rec| {
96            paired_len < *dict.get(&(rec.t_name.clone(), rec.q_name.clone())).unwrap()
97        });
98    }
99
100    pub fn filter_query_len(&mut self, min_query_len: u64) {
101        self.records.retain(|rec| rec.q_len > min_query_len);
102    }
103
104    /// Filter on alignment length
105    pub fn filter_aln_len(&mut self, min_aln_len: u64) {
106        self.records.retain(|rec| rec.t_en - rec.t_st > min_aln_len);
107    }
108
109    /// orient queries relative to their target (inverts if most bases are aligned rc).
110    pub fn orient(&mut self) {
111        let mut orient_order_dict = HashMap::new();
112        let mut t_names = HashSet::new();
113        // calculate whether a contig is mostly forward or reverse strand
114        // and determine the middle alignment position with respect to the target
115        for rec in &self.records {
116            let (orient, total_bp, order) = orient_order_dict
117                .entry((rec.t_name.clone(), rec.q_name.clone()))
118                .or_insert((0_i64, 0_u64, 0_u64));
119            // set the orientation of the query relative to the target
120            if rec.strand == '-' {
121                *orient -= (rec.q_en - rec.q_st) as i64;
122            } else {
123                *orient += (rec.q_en - rec.q_st) as i64;
124            }
125            // set a number that will determine the order of the contig
126            let weight = rec.t_en - rec.t_st;
127            *total_bp += weight;
128            *order += weight * (rec.t_st + rec.t_en) / 2;
129            // make a list of targets
130            t_names.insert(rec.t_name.clone());
131        }
132
133        // set the order and orientation of records
134        for rec in &mut self.records {
135            // set the order of the records
136            let (orient, total_bp, order) = orient_order_dict
137                .get(&(rec.t_name.clone(), rec.q_name.clone()))
138                .unwrap();
139            rec.order = *order / *total_bp;
140
141            // reverse record if it is mostly on the rc
142            if *orient < 0 {
143                rec.q_name = format!("{}-", rec.q_name);
144                let new_st = rec.q_len - rec.q_en;
145                let new_en = rec.q_len - rec.q_st;
146                rec.q_st = new_st;
147                rec.q_en = new_en;
148                rec.strand = if rec.strand == '+' { '-' } else { '+' };
149            } else {
150                rec.q_name = format!("{}+", rec.q_name);
151            }
152        }
153    }
154
155    /// scaffold oriented contigs into one fake super contig
156    pub fn scaffold(&mut self, spacer_size: u64) {
157        // sort the records by their target name and order
158        self.records.sort_by(|a, b| {
159            a.t_name
160                .cmp(&b.t_name) // group by target
161                .then(a.order.cmp(&b.order)) // order query by position in target
162                .then(a.q_st.cmp(&b.q_st)) // order by position in query
163        });
164
165        // group by t_name
166        for (_t_name, t_recs) in &self.records.iter_mut().group_by(|rec| rec.t_name.clone()) {
167            let mut t_recs: Vec<&mut PafRecord> = t_recs.collect();
168            // sort recs by order
169            t_recs.sort_by(|a, b| {
170                a.order
171                    .cmp(&b.order) // order query by position in target
172                    .then(a.q_st.cmp(&b.q_st)) // order by position in query
173            });
174
175            // new scaffold name
176            let scaffold_name = t_recs
177                .iter()
178                .map(|rec| rec.q_name.clone())
179                .unique()
180                .collect::<Vec<String>>()
181                .join("::");
182
183            let mut scaffold_len = 0_u64;
184            for (_q_name, q_recs) in &t_recs.iter_mut().group_by(|rec| rec.q_name.clone()) {
185                let q_recs: Vec<&mut &mut PafRecord> = q_recs.collect();
186                let q_min = q_recs.iter().map(|rec| rec.q_st).min().unwrap_or(0);
187                let q_max = q_recs.iter().map(|rec| rec.q_en).max().unwrap_or(0);
188                let added_q_bases = q_max - q_min;
189                for rec in q_recs {
190                    rec.q_st = rec.q_st - q_min + scaffold_len;
191                    rec.q_en = rec.q_en - q_min + scaffold_len;
192                }
193                scaffold_len += added_q_bases + spacer_size;
194            }
195            // remove padding insert on the end of rec
196            scaffold_len -= spacer_size;
197
198            for rec in t_recs {
199                rec.q_name = scaffold_name.clone();
200                rec.q_len = scaffold_len;
201            }
202        }
203    }
204
205    /// Identify overlapping pairs in Paf set
206    pub fn overlapping_paf_recs(
207        &mut self,
208        match_score: i32,
209        diff_score: i32,
210        indel_score: i32,
211        remove_contained: bool,
212    ) {
213        // remove trailing indels in all cases
214        for rec in &mut self.records {
215            rec.remove_trailing_indels();
216        }
217
218        let mut overlap_pairs = Vec::new();
219        self.records.sort_by_key(|rec| rec.q_name.clone());
220        let mut contained_indexes = vec![false; self.records.len()];
221
222        // check if there are enough records to even try this operation
223        if self.records.len() < 2 {
224            return;
225        }
226
227        for i in 0..(self.records.len() - 1) {
228            let rec1 = &self.records[i];
229            let rgn1 = rec1.get_query_as_region();
230            let mut j = i + 1;
231            while j < self.records.len() && rec1.q_name == self.records[j].q_name {
232                let rec2 = &self.records[j];
233                let rgn2 = rec2.get_query_as_region();
234                // count overlap
235                let overlap = bed::get_overlap(&rgn1, &rgn2);
236                // check if rec2 is contained
237                if overlap < 1 {
238                    j += 1;
239                    continue;
240                } else if overlap == (rec2.q_en - rec2.q_st) {
241                    contained_indexes[j] = true;
242                    log::debug!("{}\n^is contained in another alignment", rec2);
243                } else if overlap == (rec1.q_en - rec1.q_st) {
244                    contained_indexes[i] = true;
245                    log::debug!("{}\n^is contained in another alignment", rec1);
246                } else {
247                    // put recs in left, right order
248                    if rec1.q_st <= rec2.q_st {
249                        overlap_pairs.push((overlap, i, j));
250                    } else {
251                        overlap_pairs.push((overlap, j, i));
252                    }
253                }
254                // go to next
255                j += 1;
256            }
257        }
258        overlap_pairs.sort_by_key(|rec| u64::MAX - rec.0);
259        log::debug!("{} overlapping pairs found", overlap_pairs.len());
260        let mut q_seen: HashSet<String> = HashSet::new();
261        let mut unseen = 0;
262        for (_overlap, i, j) in overlap_pairs {
263            let mut left = self.records[i].clone();
264            let mut right = self.records[j].clone();
265            let q_name = left.q_name.clone();
266            // if we have not seen the q_name before it cannot be
267            // in conflict with previous trimming steps
268            if !q_seen.contains(&q_name) {
269                trim_overlapping_pafs(&mut left, &mut right, match_score, diff_score, indel_score);
270                log::trace!("{}", left);
271                log::trace!("{}", right);
272                self.records[i] = left;
273                self.records[j] = right;
274                q_seen.insert(q_name);
275            } else {
276                unseen += 1;
277            }
278        }
279
280        if unseen > 0 {
281            // recursively call for next overlap
282            self.overlapping_paf_recs(match_score, diff_score, indel_score, remove_contained);
283        } else if remove_contained {
284            let n_to_remove = contained_indexes.iter().filter(|&x| *x).count();
285            log::info!("Removing {} contained alignments.", n_to_remove);
286            log::info!("{} total alignments.", self.records.len());
287            let mut new_records = Vec::new();
288            assert!(self.records.len() == contained_indexes.len());
289            for (i, rec) in self.records.iter().enumerate() {
290                if !contained_indexes[i] {
291                    new_records.push(rec.clone());
292                }
293            }
294            self.records = new_records;
295            log::info!("{} total alignments.", self.records.len());
296            // remove contained records
297            //self.records.retain(|rec| !rec.contained);
298        }
299    }
300
301    /// Make a SAM header from a Paf
302    /// # Example
303    /// ```
304    /// use rustybam::paf;
305    /// use std::fs::File;
306    /// use std::io::*;
307    /// let mut paf = paf::Paf::from_file(".test/asm_small.paf");
308    /// let header = paf.sam_header();
309    /// assert_eq!(header[0..3], "@HD".to_string());
310    /// assert_eq!(header.split("\n").count(), 5);
311    /// ```
312    pub fn sam_header(&self) -> String {
313        /*
314        @HD	VN:1.6	SO:coordinate
315        @SQ	SN:chr1	LN:248387497
316        ...
317        @SQ	SN:chrM	LN:16569
318        @SQ	SN:chrY	LN:57227415
319        @PG	ID:unimap	PN:unimap	VN:0.1-r41	CL:
320        */
321        let mut header = "@HD\tVN:1.6\n".to_string();
322
323        // sort names naturally
324        let mut names: Vec<(String, u64)> = self
325            .records
326            .iter()
327            .map(|rec| (rec.t_name.clone(), rec.t_len))
328            .unique()
329            .collect();
330
331        names.sort_by(|a, b| natord::compare(&a.0, &b.0));
332        for (name, length) in names {
333            header.push_str(&format!("@SQ\tSN:{}\tLN:{}\n", name, length));
334        }
335        header.push_str("@PG\tID:rustybam\tPN:rustybam");
336        header
337    }
338}
339
340/// A single cs-tag operation, stored in 1:1 correspondence with CIGAR ops.
341/// Sequence data is stored externally in `CsOps::seq_data`; variants that
342/// carry sequence use `(offset, len)` into that shared buffer.
343#[derive(Debug, Clone, Copy)]
344pub enum CsOp {
345    /// `:N` — N matching bases (compact form, no sequence)
346    Matches(u32),
347    /// `=ACGT` — matching bases with explicit sequence (offset, len into seq_data)
348    MatchSeq(u32, u32),
349    /// `*xy` — single-base mismatch (ref base, query base)
350    Mismatch(u8, u8),
351    /// `+acgt` — insertion of query bases (offset, len into seq_data)
352    Insertion(u32, u32),
353    /// `-acgt` — deletion of reference bases (offset, len into seq_data)
354    Deletion(u32, u32),
355}
356
357impl CsOp {
358    /// Trim this op to keep only bases at [skip..skip+keep).
359    /// Adjusts offsets into the shared seq_data buffer — no allocation.
360    pub fn trim(&self, skip: u32, keep: u32) -> CsOp {
361        match self {
362            CsOp::Matches(_) => CsOp::Matches(keep),
363            CsOp::MatchSeq(off, _) => CsOp::MatchSeq(off + skip, keep),
364            CsOp::Mismatch(r, q) => {
365                debug_assert!(skip == 0 && keep == 1);
366                CsOp::Mismatch(*r, *q)
367            }
368            CsOp::Insertion(off, _) => CsOp::Insertion(off + skip, keep),
369            CsOp::Deletion(off, _) => CsOp::Deletion(off + skip, keep),
370        }
371    }
372}
373
374/// Bundled cs-tag operations with a single shared buffer for all sequence data.
375/// This avoids per-op heap allocations: one `Vec<u8>` instead of thousands.
376#[derive(Debug, Clone)]
377pub struct CsOps {
378    pub ops: Vec<CsOp>,
379    pub seq_data: Vec<u8>,
380}
381
382impl CsOps {
383    /// Get the sequence bytes for an op that carries sequence data.
384    #[inline]
385    pub fn seq(&self, offset: u32, len: u32) -> &[u8] {
386        &self.seq_data[offset as usize..(offset + len) as usize]
387    }
388
389    /// Get the sequence bytes for any op, returning None for Matches/Mismatch.
390    #[inline]
391    pub fn op_seq(&self, op: &CsOp) -> Option<&[u8]> {
392        match op {
393            CsOp::MatchSeq(off, len)
394            | CsOp::Insertion(off, len)
395            | CsOp::Deletion(off, len) => {
396                Some(&self.seq_data[*off as usize..(*off + *len) as usize])
397            }
398            _ => None,
399        }
400    }
401
402    /// Format the full cs-tag string (without the `cs:Z:` prefix).
403    pub fn to_cs_string(&self) -> String {
404        let mut buf = String::new();
405        for op in &self.ops {
406            match op {
407                CsOp::Matches(n) => {
408                    buf.push(':');
409                    let mut tmp = itoa::Buffer::new();
410                    buf.push_str(tmp.format(*n));
411                }
412                CsOp::MatchSeq(off, len) => {
413                    buf.push('=');
414                    buf.push_str(std::str::from_utf8(self.seq(*off, *len)).unwrap());
415                }
416                CsOp::Mismatch(r, q) => {
417                    buf.push('*');
418                    buf.push(*r as char);
419                    buf.push(*q as char);
420                }
421                CsOp::Insertion(off, len) => {
422                    buf.push('+');
423                    buf.push_str(std::str::from_utf8(self.seq(*off, *len)).unwrap());
424                }
425                CsOp::Deletion(off, len) => {
426                    buf.push('-');
427                    buf.push_str(std::str::from_utf8(self.seq(*off, *len)).unwrap());
428                }
429            }
430        }
431        buf
432    }
433}
434
435impl fmt::Display for CsOps {
436    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
437        f.write_str(&self.to_cs_string())
438    }
439}
440
441#[derive(Debug, Clone)]
442pub struct PafRecord {
443    pub q_name: String,
444    pub q_len: u64,
445    pub q_st: u64,
446    pub q_en: u64,
447    pub strand: char,
448    pub t_name: String,
449    pub t_len: u64,
450    pub t_st: u64,
451    pub t_en: u64,
452    pub nmatch: u64,
453    pub aln_len: u64,
454    pub mapq: u64,
455    pub cigar: CigarString,
456    /// Parsed cs-tag ops with shared sequence buffer, in 1:1 correspondence
457    /// with `cigar`. `None` when input was CIGAR (cg tag); `Some` when input
458    /// was cs tag.
459    pub cs_ops: Option<CsOps>,
460    pub tags: String,
461    pub id: String,
462    pub order: u64,
463    pub contained: bool,
464}
465
466impl PafRecord {
467    /// # Example
468    /// ```
469    /// use rustybam::paf;
470    /// let _paf = paf::PafRecord::new("A 1 2 3 + B 1 2 3 10 11 60").unwrap();
471    /// let rec = paf::make_fake_paf_rec();
472    /// assert_eq!("4M1I1D3=", rec.cigar.to_string());
473    ///
474    /// ```
475    pub fn new(line: &str) -> PafResult<PafRecord> {
476        let t: Vec<&str> = line.split_ascii_whitespace().collect();
477        assert!(t.len() >= 12); // must have all required columns
478
479        // Two-pass tag scanning: first find cs/cg positions (tag order is not
480        // guaranteed by the PAF spec), then parse only the preferred one.
481        let mut tags = "".to_string();
482        let mut cs_idx: Option<usize> = None;
483        let mut cg_idx: Option<usize> = None;
484        for (i, token) in t.iter().enumerate().skip(12) {
485            debug_assert!(
486                token.len() >= 5
487                    && token.as_bytes()[2] == b':'
488                    && token.as_bytes()[4] == b':',
489                "Malformed PAF tag: {}",
490                token
491            );
492            let tag = &token[..2];
493            if tag == "cs" {
494                cs_idx = Some(i);
495            } else if tag == "cg" {
496                cg_idx = Some(i);
497            } else {
498                tags.push('\t');
499                tags.push_str(token);
500            }
501        }
502
503        // Parse cs (preferred) or cg — never both.
504        // cs gives us both cigar and cs_ops; cg gives us only cigar.
505        let mut cigar = CigarString(vec![]);
506        let mut cs_ops: Option<CsOps> = None;
507        if let Some(idx) = cs_idx {
508            let value = &t[idx][5..];
509            let (parsed_cigar, parsed_ops) = parse_cs_string(value)?;
510            cigar = parsed_cigar;
511            cs_ops = Some(parsed_ops);
512        } else if let Some(idx) = cg_idx {
513            let value = &t[idx][5..];
514            log::trace!("parsing cigar of length: {}", value.len());
515            cigar =
516                CigarString::try_from(value.as_bytes()).expect("Unable to parse cigar string.");
517        }
518
519        // make the record
520        let rec = PafRecord {
521            q_name: t[0].to_string(),
522            q_len: t[1].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
523            q_st: t[2].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
524            q_en: t[3].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
525            strand: t[4].parse::<char>().map_err(|_| Error::ParsePafColumn {})?,
526            t_name: t[5].to_string(),
527            t_len: t[6].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
528            t_st: t[7].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
529            t_en: t[8].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
530            nmatch: t[9].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
531            aln_len: t[10].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
532            mapq: t[11].parse::<u64>().map_err(|_| Error::ParsePafColumn {})?,
533            cigar,
534            cs_ops,
535            tags,
536            id: "".to_string(),
537            order: 0,
538            contained: false,
539        };
540        Ok(rec)
541    }
542
543    #[must_use]
544    pub fn small_copy(&self) -> PafRecord {
545        PafRecord {
546            q_name: self.q_name.clone(),
547            q_len: self.q_len,
548            q_st: self.q_st,
549            q_en: self.q_en,
550            strand: self.strand,
551            t_name: self.t_name.clone(),
552            t_len: self.t_len,
553            t_st: self.t_st,
554            t_en: self.t_en,
555            nmatch: self.nmatch,
556            aln_len: self.aln_len,
557            mapq: self.mapq,
558            cigar: CigarString(Vec::new()),
559            cs_ops: None,
560            tags: self.tags.clone(),
561            id: self.id.clone(),
562            order: self.order,
563            contained: self.contained,
564        }
565    }
566
567    /// This function returns a region type from the paf query
568    pub fn get_query_as_region(&self) -> bed::Region {
569        bed::Region {
570            name: self.q_name.clone(),
571            st: self.q_st,
572            en: self.q_en,
573            ..Default::default()
574        }
575    }
576
577    /// This function returns a region type from the paf target
578    /// Example:
579    /// ```
580    /// use rustybam::bed::*;
581    /// use rustybam::paf::*;
582    /// let paf = PafRecord::new("Q 10 0 10 + T 20 12 20 3 9 60 cg:Z:7=1X2=").unwrap().get_target_as_region();
583    /// let rgn = Region {name: "T".to_string(), st: 12, en: 20, ..Default::default()};
584    /// assert_eq!((rgn.name, rgn.st, rgn.en),
585    ///             (paf.name, paf.st, paf.en)
586    /// );
587    /// ```
588    pub fn get_target_as_region(&self) -> bed::Region {
589        bed::Region {
590            name: self.t_name.clone(),
591            st: self.t_st,
592            en: self.t_en,
593            ..Default::default()
594        }
595    }
596
597    pub fn collapse_long_cigar(cigar: &CigarString) -> CigarString {
598        let mut rtn = Vec::new();
599        let mut pre_opt = cigar.0[0];
600        let mut pre_len = 1;
601        let mut idx = 1;
602        while idx < cigar.len() {
603            let cur_opt = cigar.0[idx];
604            if std::mem::discriminant(&cur_opt) == std::mem::discriminant(&pre_opt) {
605                pre_len += 1;
606            } else {
607                rtn.push(update_cigar_opt_len(&pre_opt, pre_len));
608                pre_opt = cur_opt;
609                pre_len = 1;
610            }
611            idx += 1;
612        }
613        rtn.push(update_cigar_opt_len(&pre_opt, pre_len));
614        CigarString(rtn)
615    }
616
617    pub fn paf_overlaps_rgn(&self, rgn: &bed::Region) -> bool {
618        if self.t_name != rgn.name {
619            return false;
620        }
621        self.t_en > rgn.st && self.t_st < rgn.en
622    }
623
624    /// Return a tuple with the n of bases in the query and
625    /// target inferred from the cigar string
626    pub fn infer_n_bases(&mut self) -> (u64, u64, u64, u64) {
627        let mut t_bases = 0;
628        let mut q_bases = 0;
629        let mut n_matches = 0;
630        let mut aln_len = 0;
631        for opt in self.cigar.into_iter() {
632            if consumes_reference(opt) {
633                t_bases += opt.len()
634            }
635            if consumes_query(opt) {
636                q_bases += opt.len()
637            }
638            if is_match(opt) {
639                n_matches += opt.len()
640            }
641            aln_len += opt.len();
642        }
643        (
644            t_bases as u64,
645            q_bases as u64,
646            n_matches as u64,
647            aln_len as u64,
648        )
649    }
650
651    pub fn remove_trailing_indels(&mut self) {
652        // if we check integrity, here it may fail due to indels on the ends
653        // self.check_integrity().unwrap();
654
655        let cigar_len = self.cigar.len();
656
657        // find start to trim
658        let mut st_opt = *self.cigar.first().unwrap();
659        let mut remove_st_t = 0;
660        let mut remove_st_q = 0;
661        let mut remove_st_opts = 0;
662        let mut removed_st_opts = Vec::new();
663        while matches!(st_opt, Ins(_) | Del(_)) {
664            if matches!(st_opt, Del(_)) {
665                // consumes reference
666                remove_st_t += st_opt.len();
667                // TODO learn why I need this
668                remove_st_q += 1;
669            } else {
670                remove_st_q += st_opt.len();
671                // TODO learn why I need this
672                // TODO handle the case when it is a del and then and ins
673                // remove_st_t += 1;
674            }
675            remove_st_opts += 1;
676            removed_st_opts.push(st_opt);
677            if remove_st_opts < cigar_len {
678                st_opt = self.cigar[remove_st_opts];
679            } else {
680                break;
681            }
682        }
683
684        // remove extra counts put in my the case of Del followed by Ins
685        if removed_st_opts.len() > 1 {
686            for i in 0..(removed_st_opts.len() - 1) {
687                let pre_opt = removed_st_opts[i];
688                let cur_opt = removed_st_opts[i + 1];
689                if (matches!(pre_opt, Del(_)) && matches!(cur_opt, Ins(_)))
690                    || (matches!(pre_opt, Ins(_)) && matches!(cur_opt, Del(_)))
691                {
692                    remove_st_t += 1;
693                    remove_st_q -= 1;
694                }
695            }
696        }
697
698        // find ends to trim
699        let mut en_opt = *self.cigar.last().unwrap();
700        let mut remove_en_t = 0;
701        let mut remove_en_q = 0;
702        let mut remove_en_opts = 0;
703        let mut removed_en_opts = Vec::new();
704        while matches!(en_opt, Ins(_) | Del(_)) {
705            if matches!(en_opt, Del(_)) {
706                // consumes reference
707                remove_en_t += en_opt.len();
708            } else {
709                remove_en_q += en_opt.len();
710            }
711            remove_en_opts += 1;
712            removed_en_opts.push(en_opt);
713            if cigar_len - remove_en_opts > 0 {
714                en_opt = self.cigar[cigar_len - 1 - remove_en_opts];
715            } else {
716                break;
717            }
718        }
719
720        // log that we did something
721        if remove_en_opts > 0 || remove_st_opts > 0 {
722            self.id += &format!(
723                "_TO.{}.{}",
724                CigarString(removed_st_opts.clone()),
725                CigarString(removed_en_opts.clone())
726            );
727        }
728
729        // some logging.
730        if remove_en_opts > 0 || remove_st_opts > 0 {
731            log::debug!(
732                "\nRemoved {} leading and {} trailing indels:\n{}\n{}\ntarget changes:{},{}\nquery changes: {},{}\n{}:{}-{}\n{}:{}-{}",
733                remove_st_opts,
734                remove_en_opts,
735                CigarString(removed_st_opts),
736                CigarString(removed_en_opts),
737                remove_st_t,
738                remove_en_t,
739                remove_st_q,
740                remove_en_q,
741                self.q_name,
742                self.q_st,
743                self.q_en,
744                self.t_name,
745                self.t_st,
746                self.t_en,
747            );
748        }
749
750        // update the cigar string (and cs_ops if present)
751        self.cigar = CigarString(self.cigar.0[remove_st_opts..].to_vec());
752        self.cigar.0.truncate(self.cigar.len() - remove_en_opts);
753        if let Some(ref mut cs) = self.cs_ops {
754            cs.ops = cs.ops[remove_st_opts..].to_vec();
755            cs.ops.truncate(cs.ops.len() - remove_en_opts);
756            // seq_data stays as-is; offsets in remaining ops are still valid
757        }
758
759        // update the target coordinates
760        self.t_st += remove_st_t as u64;
761        self.t_en -= remove_en_t as u64;
762
763        // update the query coordinates if rc
764        if self.strand == '-' {
765            std::mem::swap(&mut remove_st_q, &mut remove_en_q);
766        }
767        // fix the query positions that need to be
768        self.q_st += remove_st_q as u64;
769        self.q_en -= remove_en_q as u64;
770
771        // check we removed the indels
772        if !self.cigar.is_empty() {
773            let st_opt = *self.cigar.first().unwrap();
774            let en_opt = *self.cigar.last().unwrap();
775            if matches!(st_opt, Ins(_) | Del(_)) || matches!(en_opt, Ins(_) | Del(_)) {
776                eprintln!("Why are there still indels?\n{}", self);
777                //self.remove_trailing_indels();
778            }
779        }
780
781        // make sure we did not break the cigar
782        self.check_integrity().unwrap();
783    }
784
785    /// Truncate this record to keep only the portion within [new_q_st, new_q_en)
786    /// in query coordinates. Walks compressed CIGAR ops directly — O(n_cigar_ops).
787    pub fn truncate_record_by_query(&mut self, new_q_st: u64, new_q_en: u64) {
788        // checks
789        assert!(new_q_st >= self.q_st, "New start is less than old start.");
790        assert!(new_q_en <= self.q_en, "New end is greater than old end.");
791
792        if new_q_st == self.q_st && new_q_en == self.q_en {
793            return;
794        }
795
796        let cs_data = self.cs_ops.take(); // take ownership for parallel processing
797        let mut new_cs_ops: Option<CsOps> = cs_data.as_ref().map(|cd| CsOps {
798            ops: Vec::new(),
799            seq_data: cd.seq_data.clone(), // share the same backing buffer
800        });
801
802        let mut q_pos = if self.strand == '+' { self.q_st } else { self.q_en };
803        let mut new_cigar_ops: Vec<Cigar> = Vec::new();
804        let mut t_before: u64 = 0;
805        let mut q_consumed_before: u64 = 0;
806        let mut q_consumed_in: u64 = 0;
807        let mut started = false;
808        let mut finished = false;
809
810        for (ci, op) in self.cigar.into_iter().enumerate() {
811            if finished {
812                break;
813            }
814            let op_len = op.len() as u64;
815            let moves_t = consumes_reference(op);
816            let moves_q = consumes_query(op);
817
818            if !moves_q {
819                // Deletion or ref skip — doesn't advance query
820                if started {
821                    new_cigar_ops.push(*op);
822                    if let (Some(ref mut ncs), Some(ref cs)) = (&mut new_cs_ops, &cs_data) {
823                        ncs.ops.push(cs.ops[ci]);
824                    }
825                } else if moves_t {
826                    t_before += op_len;
827                }
828                continue;
829            }
830
831            // Query-consuming op: compute absolute query range [q_lo, q_hi)
832            let (q_lo, q_hi) = if self.strand == '+' {
833                let lo = q_pos;
834                q_pos += op_len;
835                (lo, q_pos)
836            } else {
837                q_pos -= op_len;
838                (q_pos, q_pos + op_len)
839            };
840
841            // Check overlap with [new_q_st, new_q_en)
842            let overlap_start = std::cmp::max(q_lo, new_q_st);
843            let overlap_end = std::cmp::min(q_hi, new_q_en);
844            if overlap_start >= overlap_end {
845                // No overlap
846                if !started {
847                    if moves_t {
848                        t_before += op_len;
849                    }
850                    q_consumed_before += op_len;
851                }
852                continue;
853            }
854
855            // Bases to skip at the CIGAR-start side of this op
856            let skip_before_cigar = if self.strand == '+' {
857                overlap_start - q_lo
858            } else {
859                q_hi - overlap_end
860            };
861            let keep = overlap_end - overlap_start;
862
863            if !started {
864                started = true;
865                if moves_t {
866                    t_before += skip_before_cigar;
867                }
868                q_consumed_before += skip_before_cigar;
869            }
870
871            q_consumed_in += keep;
872            new_cigar_ops.push(update_cigar_opt_len(op, keep as u32));
873            if let (Some(ref mut ncs), Some(ref cs)) = (&mut new_cs_ops, &cs_data) {
874                ncs.ops.push(cs.ops[ci].trim(skip_before_cigar as u32, keep as u32));
875            }
876
877            // Check if we've reached the far boundary
878            if (self.strand == '+' && overlap_end >= new_q_en)
879                || (self.strand == '-' && overlap_start <= new_q_st)
880            {
881                finished = true;
882            }
883        }
884
885        if new_cigar_ops.is_empty() {
886            self.cs_ops = new_cs_ops;
887            return;
888        }
889
890        // Remove leading indels and adjust coordinates
891        let mut lead_t: u64 = 0;
892        let mut lead_q: u64 = 0;
893        while !new_cigar_ops.is_empty() {
894            let first = new_cigar_ops[0];
895            if matches!(first, Match(_) | Equal(_) | Diff(_)) {
896                break;
897            }
898            if consumes_reference(&first) {
899                lead_t += first.len() as u64;
900            }
901            if consumes_query(&first) {
902                lead_q += first.len() as u64;
903            }
904            new_cigar_ops.remove(0);
905            if let Some(ref mut ncs) = new_cs_ops {
906                ncs.ops.remove(0);
907            }
908        }
909        t_before += lead_t;
910        q_consumed_before += lead_q;
911        q_consumed_in -= lead_q;
912
913        // Remove trailing indels
914        let mut trail_q: u64 = 0;
915        while !new_cigar_ops.is_empty() {
916            let last = *new_cigar_ops.last().unwrap();
917            if matches!(last, Match(_) | Equal(_) | Diff(_)) {
918                break;
919            }
920            if consumes_query(&last) {
921                trail_q += last.len() as u64;
922            }
923            new_cigar_ops.pop();
924            if let Some(ref mut ncs) = new_cs_ops {
925                ncs.ops.pop();
926            }
927        }
928        q_consumed_in -= trail_q;
929
930        if new_cigar_ops.is_empty() {
931            self.cs_ops = new_cs_ops;
932            return;
933        }
934
935        // Compute new target coordinates
936        self.t_st += t_before;
937        let t_bases: u64 = new_cigar_ops
938            .iter()
939            .filter(|op| consumes_reference(op))
940            .map(|op| op.len() as u64)
941            .sum();
942        self.t_en = self.t_st + t_bases;
943
944        // Set query coordinates
945        if self.strand == '+' {
946            self.q_st += q_consumed_before;
947            self.q_en = self.q_st + q_consumed_in;
948        } else {
949            self.q_en -= q_consumed_before;
950            self.q_st = self.q_en - q_consumed_in;
951        }
952
953        self.cigar = CigarString(new_cigar_ops);
954        self.cs_ops = new_cs_ops;
955
956        // check integrity and update aln_len and nmatch
957        self.check_integrity().unwrap();
958    }
959
960    pub fn check_integrity(&mut self) -> PafResult<()> {
961        let (t_bases, q_bases, nmatch, aln_len) = self.infer_n_bases();
962        if self.t_en - self.t_st != t_bases {
963            return Err(Error::PafParseCigar {
964                msg: format!(
965                    "target bases {} from cigar does not equal {}-{}={}\n{}\n",
966                    t_bases,
967                    self.t_en,
968                    self.t_st,
969                    self.t_en - self.t_st,
970                    self
971                ),
972            });
973        }
974        if self.q_en - self.q_st != q_bases {
975            return Err(Error::PafParseCigar {
976                msg: format!(
977                    "query bases {} from cigar does not equal {}-{}={}\n{}\n",
978                    q_bases,
979                    self.q_en,
980                    self.q_st,
981                    self.q_en - self.q_st,
982                    self
983                ),
984            });
985        }
986
987        // update other fields
988        self.nmatch = nmatch;
989        self.aln_len = aln_len;
990
991        Ok(())
992    }
993
994    /// Print the paf record as a SAM record
995    /// Example:
996    /// ```
997    /// use rustybam::bed::*;
998    /// use rustybam::paf::*;
999    /// let paf = PafRecord::new("Q 10 0 10 + T 20 12 20 3 9 60 cg:Z:7=1X2=").unwrap();
1000    /// let sam = paf.to_sam_string(None);
1001    /// ```
1002    pub fn to_sam_string(&self, reader: Option<&faidx::Reader>) -> String {
1003        /*
1004        m64062_190807_194840/133628256/ccs	0	chr1	1	60	396=	*	0	0   *   *
1005        */
1006        let mut clip_char = 'H';
1007        let seq = match reader {
1008            Some(reader) => {
1009                let seq = getfasta::fetch_fasta(
1010                    reader,
1011                    &self.q_name,
1012                    0, //self.q_st as usize,
1013                    self.q_len as usize,
1014                );
1015                clip_char = 'S';
1016                let seq = if self.strand == '-' {
1017                    revcomp(seq)
1018                } else {
1019                    seq
1020                };
1021                std::str::from_utf8(&seq).unwrap().to_string()
1022            }
1023            None => "*".to_string(),
1024        };
1025        let qual = "*".to_string();
1026        let flag = if self.strand == '-' { 16 } else { 0 };
1027        let mut leading_clip = if self.q_st > 0 {
1028            format!("{}{}", self.q_st, clip_char)
1029        } else {
1030            "".to_string()
1031        };
1032        let mut trailing_clip = if self.q_len - self.q_en > 0 {
1033            format!("{}{}", self.q_len - self.q_en, clip_char)
1034        } else {
1035            "".to_string()
1036        };
1037        if self.strand == '-' {
1038            std::mem::swap(&mut leading_clip, &mut trailing_clip);
1039        }
1040        let o_cigar = format!("{}{}{}", leading_clip, self.cigar, trailing_clip);
1041        format!(
1042            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
1043            self.q_name,
1044            flag,
1045            self.t_name,
1046            self.t_st + 1,
1047            self.mapq,
1048            o_cigar,
1049            "*",
1050            0,
1051            0,
1052            seq,
1053            qual
1054        )
1055    }
1056}
1057
1058impl PafRecord {
1059    /// Write this record into a reusable String buffer, avoiding per-record
1060    /// heap allocation. The caller should call `buf.clear()` before each use
1061    /// and reuse the same buffer across records.
1062    ///
1063    /// Uses `itoa` for integer formatting to bypass `core::fmt` dynamic dispatch,
1064    /// which was measured at ~57% of main-thread time in profiling.
1065    pub fn write_to_buf(&self, buf: &mut String) {
1066        // Helper: append a u64 via itoa (no fmt overhead)
1067        #[inline(always)]
1068        fn push_u64(buf: &mut String, v: u64) {
1069            let mut tmp = itoa::Buffer::new();
1070            buf.push_str(tmp.format(v));
1071        }
1072        #[inline(always)]
1073        fn push_u32(buf: &mut String, v: u32) {
1074            let mut tmp = itoa::Buffer::new();
1075            buf.push_str(tmp.format(v));
1076        }
1077
1078        buf.push_str(&self.q_name);
1079        buf.push('\t');
1080        push_u64(buf, self.q_len);
1081        buf.push('\t');
1082        push_u64(buf, self.q_st);
1083        buf.push('\t');
1084        push_u64(buf, self.q_en);
1085        buf.push('\t');
1086        buf.push(self.strand);
1087        buf.push('\t');
1088        buf.push_str(&self.t_name);
1089        buf.push('\t');
1090        push_u64(buf, self.t_len);
1091        buf.push('\t');
1092        push_u64(buf, self.t_st);
1093        buf.push('\t');
1094        push_u64(buf, self.t_en);
1095        buf.push('\t');
1096        push_u64(buf, self.nmatch);
1097        buf.push('\t');
1098        push_u64(buf, self.aln_len);
1099        buf.push('\t');
1100        push_u64(buf, self.mapq);
1101        buf.push_str("\tid:Z:");
1102        buf.push_str(&self.id);
1103        buf.push_str("\tcg:Z:");
1104
1105        for op in self.cigar.iter() {
1106            push_u32(buf, op.len());
1107            buf.push(match op {
1108                Match(_) => 'M',
1109                Ins(_) => 'I',
1110                Del(_) => 'D',
1111                RefSkip(_) => 'N',
1112                SoftClip(_) => 'S',
1113                HardClip(_) => 'H',
1114                Pad(_) => 'P',
1115                Equal(_) => '=',
1116                Diff(_) => 'X',
1117            });
1118        }
1119
1120        if let Some(ref cs) = self.cs_ops {
1121            buf.push_str("\tcs:Z:");
1122            for op in &cs.ops {
1123                match op {
1124                    CsOp::Matches(n) => {
1125                        buf.push(':');
1126                        push_u32(buf, *n);
1127                    }
1128                    CsOp::MatchSeq(off, len) => {
1129                        buf.push('=');
1130                        buf.push_str(std::str::from_utf8(cs.seq(*off, *len)).unwrap());
1131                    }
1132                    CsOp::Mismatch(r, q) => {
1133                        buf.push('*');
1134                        buf.push(*r as char);
1135                        buf.push(*q as char);
1136                    }
1137                    CsOp::Insertion(off, len) => {
1138                        buf.push('+');
1139                        buf.push_str(std::str::from_utf8(cs.seq(*off, *len)).unwrap());
1140                    }
1141                    CsOp::Deletion(off, len) => {
1142                        buf.push('-');
1143                        buf.push_str(std::str::from_utf8(cs.seq(*off, *len)).unwrap());
1144                    }
1145                }
1146            }
1147        }
1148    }
1149}
1150
1151impl fmt::Display for PafRecord {
1152    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1153        let mut buf = String::with_capacity(256);
1154        self.write_to_buf(&mut buf);
1155        f.write_str(&buf)
1156    }
1157}
1158
1159/// Helper for fast PAF output: locks stdout once, uses a BufWriter, and
1160/// reuses a single String buffer across all records.
1161pub struct PafWriter {
1162    out: std::io::BufWriter<std::io::StdoutLock<'static>>,
1163    buf: String,
1164}
1165
1166impl PafWriter {
1167    pub fn new() -> Self {
1168        PafWriter {
1169            out: std::io::BufWriter::with_capacity(
1170                64 * 1024,
1171                std::io::stdout().lock(),
1172            ),
1173            buf: String::with_capacity(512 * 1024),
1174        }
1175    }
1176
1177    pub fn write_rec(&mut self, rec: &PafRecord) {
1178        use std::io::Write;
1179        self.buf.clear();
1180        rec.write_to_buf(&mut self.buf);
1181        self.out.write_all(self.buf.as_bytes()).unwrap();
1182        self.out.write_all(b"\n").unwrap();
1183    }
1184}
1185
1186pub fn consumes_reference(cigar_opt: &Cigar) -> bool {
1187    matches!(
1188        cigar_opt,
1189        Match(_i) | Del(_i) | RefSkip(_i) | Diff(_i) | Equal(_i)
1190    )
1191}
1192/// # Example
1193/// ```
1194/// use rustybam::paf;
1195/// use rust_htslib::bam::record::Cigar::*;
1196/// assert!(paf::consumes_query(&Diff(5)));
1197/// ```
1198pub fn consumes_query(cigar_opt: &Cigar) -> bool {
1199    matches!(
1200        cigar_opt,
1201        Match(_i) | Ins(_i) | SoftClip(_i) | Diff(_i) | Equal(_i)
1202    )
1203}
1204
1205/// # Example
1206/// ```
1207/// use rustybam::paf;
1208/// use rust_htslib::bam::record::Cigar::*;
1209/// assert!(paf::is_match(&Match(5)));
1210/// assert!(paf::is_match(&Diff(5)));
1211/// assert!(paf::is_match(&Equal(5)));
1212/// ```
1213pub fn is_match(cigar_opt: &Cigar) -> bool {
1214    matches!(cigar_opt, Match(_i) | Diff(_i) | Equal(_i))
1215}
1216
1217/// # Example
1218/// ```
1219/// use rustybam::paf;
1220/// use rust_htslib::bam::record::Cigar::*;
1221/// assert_eq!(Diff(5), paf::update_cigar_opt_len(&Diff(10), 5));
1222/// assert_eq!(Diff(10), paf::update_cigar_opt_len(&Diff(1), 10));
1223/// ```
1224pub fn update_cigar_opt_len(opt: &Cigar, new_opt_len: u32) -> Cigar {
1225    match opt {
1226        Match(_) => Match(new_opt_len),
1227        Ins(_) => Ins(new_opt_len),
1228        Del(_) => Del(new_opt_len),
1229        RefSkip(_) => RefSkip(new_opt_len),
1230        HardClip(_) => HardClip(new_opt_len),
1231        SoftClip(_) => SoftClip(new_opt_len),
1232        Pad(_) => Pad(new_opt_len),
1233        Equal(_) => Equal(new_opt_len),
1234        Diff(_) => Diff(new_opt_len),
1235    }
1236}
1237
1238/// Create a CigarString from given str.
1239/// # Example
1240/// ```
1241/// use rustybam::paf;
1242/// use rust_htslib::bam::record::*;
1243/// use rust_htslib::bam::record::CigarString;
1244/// use rust_htslib::bam::record::Cigar::*;
1245/// use std::convert::TryFrom;
1246/// use std::str::FromStr;
1247/// let cigars = vec!["10M4D100I1102=", "100000M20=5P10X4M"];
1248/// for cigar_str in cigars{
1249///     let my_parse = paf::cigar_from_str(cigar_str).expect("Unable to parse cigar");
1250///     let hts_parse = CigarString::try_from(cigar_str).expect("Unable to parse cigar");
1251///     assert_eq!(my_parse, hts_parse);
1252/// }
1253/// ```
1254pub fn cigar_from_str(text: &str) -> PafResult<CigarString> {
1255    let bytes = text.as_bytes();
1256    let mut inner = Vec::new();
1257    let mut i = 0;
1258    let text_len = text.len();
1259    while i < text_len {
1260        let mut j = i;
1261        while j < text_len && bytes[j].is_ascii_digit() {
1262            j += 1;
1263        }
1264        let n = u32::from_str(&text[i..j]).map_err(|_| Error::PafParseCigar {
1265            msg: "expected integer".to_owned(),
1266        })?;
1267        let op = &text[j..j + 1];
1268        inner.push(match op {
1269            "M" => Cigar::Match(n),
1270            "I" => Cigar::Ins(n),
1271            "D" => Cigar::Del(n),
1272            "N" => Cigar::RefSkip(n),
1273            "H" => Cigar::HardClip(n),
1274            "S" => Cigar::SoftClip(n),
1275            "P" => Cigar::Pad(n),
1276            "=" => Cigar::Equal(n),
1277            "X" => Cigar::Diff(n),
1278            op => {
1279                return Err(Error::PafParseCigar {
1280                    msg: format!("Cannot parse opt: {}", op),
1281                })
1282            }
1283        });
1284        i = j + 1;
1285    }
1286    Ok(CigarString(inner))
1287}
1288
1289/// Basically swaps the query and the reference in a cigar
1290pub fn cigar_swap_target_query(cigar: &CigarString, strand: char) -> CigarString {
1291    // flip cigar
1292    let mut new_cigar = Vec::new();
1293    for opt in cigar.into_iter() {
1294        let new_opt = match opt {
1295            Ins(l) => Del(*l),
1296            Del(l) => Ins(*l),
1297            _ => *opt,
1298        };
1299        new_cigar.push(new_opt);
1300    }
1301    if strand == '-' {
1302        new_cigar.reverse();
1303    }
1304    CigarString(new_cigar)
1305}
1306
1307/// Swaps the query and reference and inverts the cigar sting
1308pub fn paf_swap_query_and_target(paf: &PafRecord) -> PafRecord {
1309    let mut flipped = paf.clone();
1310    // flip target
1311    flipped.t_name = paf.q_name.clone();
1312    flipped.t_len = paf.q_len;
1313    flipped.t_st = paf.q_st;
1314    flipped.t_en = paf.q_en;
1315    // flip query
1316    flipped.q_name = paf.t_name.clone();
1317    flipped.q_len = paf.t_len;
1318    flipped.q_st = paf.t_st;
1319    flipped.q_en = paf.t_en;
1320
1321    // flip the cigar
1322    flipped.cigar = cigar_swap_target_query(&paf.cigar, paf.strand);
1323
1324    // flip cs_ops: swap Insertion ↔ Deletion
1325    flipped.cs_ops = paf.cs_ops.as_ref().map(|cs| {
1326        let mut new_ops: Vec<CsOp> = cs
1327            .ops
1328            .iter()
1329            .map(|op| match op {
1330                CsOp::Insertion(off, len) => CsOp::Deletion(*off, *len),
1331                CsOp::Deletion(off, len) => CsOp::Insertion(*off, *len),
1332                other => *other,
1333            })
1334            .collect();
1335        if paf.strand == '-' {
1336            new_ops.reverse();
1337        }
1338        CsOps {
1339            ops: new_ops,
1340            seq_data: cs.seq_data.clone(),
1341        }
1342    });
1343
1344    flipped
1345}
1346
1347pub fn make_fake_paf_rec() -> PafRecord {
1348    PafRecord::new("Q 10 2 10 - T 20 12 20 3 9 60 cg:Z:4M1I1D3=").unwrap()
1349}
1350
1351/// Parse a cs-tag string into both a CigarString and a CsOps (ops + shared seq buffer).
1352/// Supports both long form (`=ACGT`, `*at`) and short form (`:10`).
1353///
1354/// All sequence data is stored in a single contiguous `Vec<u8>` — one allocation
1355/// instead of thousands of per-op `Vec<u8>` allocations.
1356///
1357/// # Example
1358/// ```
1359/// use rust_htslib::bam::record::Cigar::*;
1360/// use rustybam::paf;
1361/// let (cigar, cs) = paf::parse_cs_string(":10=ACGTN+acgtn-acgtn*at=A").unwrap();
1362/// assert_eq!(cigar[0], Equal(10));
1363/// assert_eq!(cigar[1], Equal(5));
1364/// assert_eq!(cigar[2], Ins(5));
1365/// assert_eq!(cigar[3], Del(5));
1366/// assert_eq!(cigar[4], Diff(1));
1367/// assert_eq!(cigar[5], Equal(1));
1368/// assert_eq!(cs.ops.len(), cigar.len());
1369/// ```
1370pub fn parse_cs_string(cs: &str) -> PafResult<(CigarString, CsOps)> {
1371    let bytes = cs.as_bytes();
1372    let length = bytes.len();
1373    let mut i = 0;
1374    // Pre-allocate: each CS op is at least 2 chars, so len/2 is an upper bound
1375    let estimated_ops = length / 2;
1376    let mut cigar = Vec::with_capacity(estimated_ops);
1377    let mut ops = Vec::with_capacity(estimated_ops);
1378    // Single buffer for all sequence data — upper bound is the full cs string length
1379    let mut seq_data: Vec<u8> = Vec::with_capacity(length);
1380    while i < length {
1381        let cs_opt = bytes[i];
1382        i += 1; // past the operator
1383        match cs_opt {
1384            b'=' => {
1385                let start = i;
1386                while i < length && matches!(bytes[i], b'A' | b'C' | b'G' | b'T' | b'N') {
1387                    i += 1;
1388                }
1389                let l = (i - start) as u32;
1390                let offset = seq_data.len() as u32;
1391                seq_data.extend_from_slice(&bytes[start..i]);
1392                cigar.push(Cigar::Equal(l));
1393                ops.push(CsOp::MatchSeq(offset, l));
1394            }
1395            b':' => {
1396                let start = i;
1397                while i < length && bytes[i].is_ascii_digit() {
1398                    i += 1;
1399                }
1400                let l = u32::from_str(&cs[start..i]).map_err(|_| Error::ParseIntError {
1401                    msg: "Expected integer in cs :N operator".to_string(),
1402                })?;
1403                cigar.push(Cigar::Equal(l));
1404                ops.push(CsOp::Matches(l));
1405            }
1406            b'*' => {
1407                let ref_base = bytes[i];
1408                let query_base = bytes[i + 1];
1409                i += 2;
1410                cigar.push(Cigar::Diff(1));
1411                ops.push(CsOp::Mismatch(ref_base, query_base));
1412            }
1413            b'+' => {
1414                let start = i;
1415                while i < length && bytes[i].is_ascii_lowercase() {
1416                    i += 1;
1417                }
1418                let l = (i - start) as u32;
1419                let offset = seq_data.len() as u32;
1420                seq_data.extend_from_slice(&bytes[start..i]);
1421                cigar.push(Cigar::Ins(l));
1422                ops.push(CsOp::Insertion(offset, l));
1423            }
1424            b'-' => {
1425                let start = i;
1426                while i < length && bytes[i].is_ascii_lowercase() {
1427                    i += 1;
1428                }
1429                let l = (i - start) as u32;
1430                let offset = seq_data.len() as u32;
1431                seq_data.extend_from_slice(&bytes[start..i]);
1432                cigar.push(Cigar::Del(l));
1433                ops.push(CsOp::Deletion(offset, l));
1434            }
1435            b'~' => {
1436                return Err(Error::PafParseCS {
1437                    msg: "Splice operations not yet supported.".to_string(),
1438                });
1439            }
1440            _ => {
1441                return Err(Error::PafParseCS {
1442                    msg: format!("Unexpected operator in the cs string: {}", cs_opt as char),
1443                });
1444            }
1445        }
1446    }
1447    Ok((CigarString(cigar), CsOps { ops, seq_data }))
1448}
1449
1450/// Parse a cs-tag string into a CigarString (convenience wrapper).
1451///
1452/// # Example
1453/// ```
1454/// use rust_htslib::bam::record::Cigar::*;
1455/// use rustybam::paf;
1456/// let cigar = paf::cs_to_cigar(":10=ACGTN+acgtn-acgtn*at=A").unwrap();
1457/// assert_eq!(cigar[0], Equal(10));
1458/// assert_eq!(cigar[1], Equal(5));
1459/// assert_eq!(cigar[2], Ins(5));
1460/// assert_eq!(cigar[3], Del(5));
1461/// assert_eq!(cigar[4], Diff(1));
1462/// assert_eq!(cigar[5], Equal(1));
1463/// ```
1464pub fn cs_to_cigar(cs: &str) -> PafResult<CigarString> {
1465    cigar_from_cs(cs)
1466}
1467
1468/// Fast cs-tag to CigarString parser. Only extracts operation types and lengths
1469/// without allocating sequence data. Use `parse_cs_string` when you need full
1470/// base-level CsOp detail.
1471///
1472/// # Example
1473/// ```
1474/// use rust_htslib::bam::record::Cigar::*;
1475/// use rustybam::paf;
1476/// let cigar = paf::cigar_from_cs(":10=ACGTN+acgtn-acgtn*at=A").unwrap();
1477/// assert_eq!(cigar[0], Equal(10));
1478/// assert_eq!(cigar[1], Equal(5));
1479/// assert_eq!(cigar[2], Ins(5));
1480/// assert_eq!(cigar[3], Del(5));
1481/// assert_eq!(cigar[4], Diff(1));
1482/// assert_eq!(cigar[5], Equal(1));
1483/// ```
1484pub fn cigar_from_cs(cs: &str) -> PafResult<CigarString> {
1485    let bytes = cs.as_bytes();
1486    let length = bytes.len();
1487    let mut i = 0;
1488    let estimated_ops = length / 2;
1489    let mut cigar = Vec::with_capacity(estimated_ops);
1490    while i < length {
1491        let cs_opt = bytes[i];
1492        i += 1;
1493        match cs_opt {
1494            b'=' => {
1495                let start = i;
1496                while i < length && matches!(bytes[i], b'A' | b'C' | b'G' | b'T' | b'N') {
1497                    i += 1;
1498                }
1499                cigar.push(Cigar::Equal((i - start) as u32));
1500            }
1501            b':' => {
1502                let start = i;
1503                while i < length && bytes[i].is_ascii_digit() {
1504                    i += 1;
1505                }
1506                let l = u32::from_str(&cs[start..i]).map_err(|_| Error::ParseIntError {
1507                    msg: "Expected integer in cs :N operator".to_string(),
1508                })?;
1509                cigar.push(Cigar::Equal(l));
1510            }
1511            b'*' => {
1512                i += 2;
1513                cigar.push(Cigar::Diff(1));
1514            }
1515            b'+' => {
1516                let start = i;
1517                while i < length && bytes[i].is_ascii_lowercase() {
1518                    i += 1;
1519                }
1520                cigar.push(Cigar::Ins((i - start) as u32));
1521            }
1522            b'-' => {
1523                let start = i;
1524                while i < length && bytes[i].is_ascii_lowercase() {
1525                    i += 1;
1526                }
1527                cigar.push(Cigar::Del((i - start) as u32));
1528            }
1529            b'~' => {
1530                return Err(Error::PafParseCS {
1531                    msg: "Splice operations not yet supported.".to_string(),
1532                });
1533            }
1534            _ => {
1535                return Err(Error::PafParseCS {
1536                    msg: format!("Unexpected operator in the cs string: {}", cs_opt as char),
1537                });
1538            }
1539        }
1540    }
1541    Ok(CigarString(cigar))
1542}