1use itertools::Itertools;
31use serde::{Deserialize, Serialize};
32use std::fmt::Write;
33
34use crate::*;
35
36#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
38pub enum CigarOp {
39 Match,
41 Sub,
43 Del,
45 Ins,
47}
48
49#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
51pub enum CigarOpChars {
52 Match(u8),
54 Sub(u8, u8),
56 Del(u8),
58 Ins(u8),
60}
61
62#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
64pub struct CigarElem {
65 pub op: CigarOp,
66 pub cnt: I,
67}
68
69impl CigarElem {
70 pub fn new(op: CigarOp, cnt: I) -> Self {
71 Self { op, cnt }
72 }
73}
74
75#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Default)]
81pub struct Cigar {
82 pub ops: Vec<CigarElem>,
83}
84
85impl CigarOp {
86 pub fn to_char(&self) -> char {
88 match self {
89 CigarOp::Match => '=',
90 CigarOp::Sub => 'X',
91 CigarOp::Ins => 'I',
92 CigarOp::Del => 'D',
93 }
94 }
95
96 #[inline(always)]
98 pub fn delta(&self) -> Pos {
99 match self {
100 CigarOp::Match | CigarOp::Sub => Pos(1, 1),
101 CigarOp::Del => Pos(1, 0), CigarOp::Ins => Pos(0, 1), }
104 }
105
106 #[inline(always)]
110 pub fn edit_cost(&self) -> Cost {
111 match self {
112 CigarOp::Match => 0,
113 _ => 1,
114 }
115 }
116
117 #[inline(always)]
121 pub fn from_delta(delta: Pos) -> Self {
122 match delta {
123 Pos(0, 1) => CigarOp::Ins,
124 Pos(1, 0) => CigarOp::Del,
125 Pos(1, 1) => CigarOp::Match,
126 _ => panic!("Invalid delta: {:?}", delta),
127 }
128 }
129}
130
131impl std::ops::Mul<I> for Pos {
133 type Output = Pos;
134 fn mul(self, rhs: I) -> Pos {
135 Pos(self.0 * rhs, self.1 * rhs)
136 }
137}
138
139impl From<u8> for CigarOp {
140 fn from(op: u8) -> Self {
144 match op {
145 b'=' | b'M' => CigarOp::Match,
146 b'X' => CigarOp::Sub,
147 b'I' => CigarOp::Ins,
148 b'D' => CigarOp::Del,
149 _ => panic!("Invalid CigarOp"),
150 }
151 }
152}
153
154impl ToString for Cigar {
155 fn to_string(&self) -> String {
157 let mut s = String::new();
158 for elem in &self.ops {
159 write!(&mut s, "{}{}", elem.cnt, elem.op.to_char()).unwrap();
160 }
161 s
162 }
163}
164
165impl Cigar {
166 pub fn from_ops(ops: impl Iterator<Item = CigarOp>) -> Self {
168 Cigar {
169 ops: ops
170 .chunk_by(|&op| op)
171 .into_iter()
172 .map(|(op, group)| CigarElem::new(op, group.count() as _))
173 .collect(),
174 }
175 }
176
177 pub fn from_path(text: Seq, pattern: Seq, path: &Path) -> Cigar {
183 if path[0] != Pos(0, 0) {
184 panic!("Path must start at (0,0)!");
185 }
186 Self::resolve_matches(
187 path.iter()
188 .tuple_windows()
189 .map(|(&text_pos, &pattern_pos)| {
190 CigarElem::new(CigarOp::from_delta(pattern_pos - text_pos), 1)
191 }),
192 text,
193 pattern,
194 )
195 }
196
197 pub fn to_char_pairs<'s>(&'s self, text: &'s [u8], pattern: &'s [u8]) -> Vec<CigarOpChars> {
199 let mut pos = Pos(0, 0);
200 let fix_case = !(b'A' ^ b'a');
201 let mut out = vec![];
202 for el in &self.ops {
203 for _ in 0..el.cnt {
204 let c = match el.op {
205 CigarOp::Match => {
206 CigarOpChars::Match(text[pos.0 as usize])
213 }
214 CigarOp::Sub => {
215 assert_ne!(
216 (text[pos.0 as usize] & fix_case) as char,
217 (pattern[pos.1 as usize] & fix_case) as char,
218 "cigar {:?}\npattern {:?}\ntext {:?}\nmismatch for {pos:?}",
219 self.to_string(),
220 String::from_utf8_lossy(pattern),
221 String::from_utf8_lossy(text)
222 );
223 CigarOpChars::Sub(text[pos.0 as usize], pattern[pos.1 as usize])
224 }
225 CigarOp::Del => {
226 CigarOpChars::Del(text[pos.0 as usize])
228 }
229 CigarOp::Ins => {
230 CigarOpChars::Ins(pattern[pos.1 as usize])
232 }
233 };
234 out.push(c);
235 pos += el.op.delta();
236 }
237 }
238 out
239 }
240
241 pub fn to_path(&self) -> Path {
243 let mut pos = Pos(0, 0);
244 let mut path = vec![pos];
245 for el in &self.ops {
246 for _ in 0..el.cnt {
247 pos += el.op.delta();
248 path.push(pos);
249 }
250 }
251 path
252 }
253
254 pub fn to_path_with_costs(&self, cm: CostModel) -> Vec<(Pos, Cost)> {
256 let mut pos = Pos(0, 0);
257 let mut cost = 0;
258 let mut path = vec![(pos, cost)];
259
260 for el in &self.ops {
261 match el.op {
262 CigarOp::Match => {
263 for _ in 0..el.cnt {
264 pos += el.op.delta();
265 path.push((pos, cost));
266 }
267 }
268 CigarOp::Sub => {
269 for _ in 0..el.cnt {
270 pos += el.op.delta();
271 cost += cm.sub;
272 path.push((pos, cost));
273 }
274 }
275 CigarOp::Ins => {
276 for len in 1..=(el.cnt as Cost) {
277 pos += el.op.delta();
278 path.push((pos, cost + cm.ins(len)));
279 }
280 cost += cm.ins(el.cnt);
281 }
282 CigarOp::Del => {
283 for len in 1..=(el.cnt as Cost) {
284 pos += el.op.delta();
285 path.push((pos, cost + cm.del(len)));
286 }
287 cost += cm.del(el.cnt);
288 }
289 }
290 }
291 path
292 }
293
294 pub fn push(&mut self, op: CigarOp) {
296 if let Some(s) = self.ops.last_mut() {
297 if s.op == op {
298 s.cnt += 1;
299 return;
300 }
301 }
302 self.ops.push(CigarElem { op, cnt: 1 });
303 }
304
305 pub fn pop_op(&mut self) -> Option<CigarOp> {
307 while let Some(elem) = self.ops.last_mut() {
308 let op = elem.op;
309 assert!(elem.cnt > 0);
310 elem.cnt -= 1;
311 if elem.cnt == 0 {
312 self.ops.pop();
313 }
314 return Some(op);
315 }
316 None
317 }
318
319 pub fn push_elem(&mut self, e: CigarElem) {
321 if let Some(s) = self.ops.last_mut() {
322 if s.op == e.op {
323 s.cnt += e.cnt;
324 return;
325 }
326 }
327 self.ops.push(e);
328 }
329
330 pub fn push_matches(&mut self, cnt: I) {
332 if let Some(s) = self.ops.last_mut() {
333 if s.op == CigarOp::Match {
334 s.cnt += cnt;
335 return;
336 }
337 }
338 self.ops.push(CigarElem {
339 op: CigarOp::Match,
340 cnt: cnt as _,
341 });
342 }
343
344 pub fn verify(&self, cm: &CostModel, text: Seq, pattern: Seq) -> Result<Cost, &str> {
346 let mut pos = Pos(0, 0);
347 let mut cost: Cost = 0;
348
349 for &CigarElem { op, cnt } in &self.ops {
350 match op {
351 CigarOp::Match => {
352 for _ in 0..cnt {
353 if text.get(pos.0 as usize) != pattern.get(pos.1 as usize) {
354 return Err("Expected match but found substitution.");
355 }
356 pos += op.delta();
357 }
358 }
359 CigarOp::Sub => {
360 for _ in 0..cnt {
361 if text.get(pos.0 as usize) == pattern.get(pos.1 as usize) {
362 return Err("Expected substitution but found match.");
363 }
364 pos += op.delta();
365 cost += cm.sub;
366 }
367 }
368 CigarOp::Ins => {
369 cost += cm.open + cnt as Cost * cm.extend;
370 pos += op.delta() * cnt;
371 }
372 CigarOp::Del => {
373 cost += cm.open + cnt as Cost * cm.extend;
374 pos += op.delta() * cnt;
375 }
376 }
377 }
378 if pos != Pos(text.len() as I, pattern.len() as I) {
379 return Err("Wrong alignment length.");
380 }
381
382 Ok(cost)
383 }
384
385 pub fn resolve_matches(ops: impl Iterator<Item = CigarElem>, text: Seq, pattern: Seq) -> Self {
387 let mut pos = Pos(0, 0);
388 let mut c = Cigar { ops: vec![] };
389 for CigarElem { op, cnt } in ops {
390 match op {
391 CigarOp::Match => {
392 for _ in 0..cnt {
393 c.push(if text[pos.0 as usize] == pattern[pos.1 as usize] {
394 CigarOp::Match
395 } else {
396 CigarOp::Sub
397 });
398 pos += op.delta();
399 }
400 continue;
401 }
402 _ => {
403 pos += op.delta() * cnt;
404 }
405 };
406 c.push_elem(CigarElem { op, cnt });
407 }
408 c
409 }
410
411 pub fn parse_without_counts(s: &str, text: Seq, pattern: Seq) -> Self {
414 Self::resolve_matches(
415 s.as_bytes().iter().map(|&op| CigarElem {
416 op: op.into(),
417 cnt: 1,
418 }),
419 text,
420 pattern,
421 )
422 }
423
424 pub fn parse_without_resolving(s: &str) -> Self {
427 let mut c = Cigar { ops: vec![] };
428 for &op in s.as_bytes() {
429 c.push(op.into())
430 }
431 c
432 }
433
434 pub fn from_string(s: &str) -> Self {
436 let mut c = Cigar { ops: vec![] };
437 for slice in s.as_bytes().split_inclusive(|b| !b.is_ascii_digit()) {
438 let (&op, cnt_bytes) = slice.split_last().expect("Cigar string cannot be empty");
439 let cnt = if cnt_bytes.is_empty() {
440 1
441 } else {
442 unsafe { std::str::from_utf8_unchecked(cnt_bytes) }
443 .parse()
444 .expect("Invalid Cigar count")
445 };
446 c.push_elem(CigarElem { op: op.into(), cnt });
447 }
448 c
449 }
450
451 pub fn parse(s: &str, text: Seq, pattern: Seq) -> Self {
454 Self::resolve_matches(
455 s.as_bytes()
456 .split_inclusive(|pattern| pattern.is_ascii_alphabetic())
457 .map(|pattern_slice| {
458 let (&op, cnt) = pattern_slice.split_last().unwrap();
459 let cnt = if cnt.is_empty() {
460 1
461 } else {
462 unsafe { std::str::from_utf8_unchecked(cnt) }
463 .parse()
464 .unwrap()
465 };
466 CigarElem { op: op.into(), cnt }
467 }),
468 text,
469 pattern,
470 )
471 }
472
473 pub fn clear(&mut self) {
475 self.ops.clear();
476 }
477
478 pub fn reverse(&mut self) {
480 self.ops.reverse();
481 }
482}
483
484#[cfg(test)]
485mod tests {
486 use super::*;
487
488 #[test]
489 fn test_delta() {
490 for op in [CigarOp::Match, CigarOp::Del, CigarOp::Ins] {
491 assert_eq!(CigarOp::from_delta(op.delta()), op);
492 }
493 assert_eq!(CigarOp::from_delta(Pos(1, 1)), CigarOp::Match);
494 assert_eq!(CigarOp::from_delta(Pos(1, 0)), CigarOp::Del); assert_eq!(CigarOp::from_delta(Pos(0, 1)), CigarOp::Ins); }
498
499 #[test]
500 fn test_valid_eq() {
501 let c = Cigar::from_path(b"ab", b"aa", &vec![Pos(0, 0), Pos(1, 1), Pos(2, 2)]);
502 assert_eq!(c.to_string(), "1=1X");
503 }
504
505 #[test]
506 fn test_invalid_end_length() {
507 let c = Cigar::from_path(b"ab", b"aa", &vec![Pos(0, 0), Pos(0, 1)]);
509 assert!(c.verify(&CostModel::unit(), b"ab", b"aa").is_err());
510 }
511
512 #[test]
513 fn to_string() {
514 let c = Cigar {
515 ops: vec![
516 CigarElem {
517 op: CigarOp::Ins,
518 cnt: 1,
519 },
520 CigarElem {
521 op: CigarOp::Match,
522 cnt: 2,
523 },
524 ],
525 };
526 assert_eq!(c.to_string(), "1I2=");
527 }
528
529 #[test]
530 fn from_path() {
531 let c = Cigar::from_path(
532 b"aaa",
533 b"aabc",
534 &vec![Pos(0, 0), Pos(1, 1), Pos(2, 2), Pos(3, 3), Pos(3, 4)],
535 );
536 assert_eq!(c.to_string(), "2=1X1I");
537 }
538
539 #[test]
540 fn from_string_with_count() {
541 let c = Cigar::from_string("24=");
542 assert_eq!(c.ops.len(), 1);
543 assert_eq!(c.ops[0], CigarElem::new(CigarOp::Match, 24));
544 }
545
546 #[test]
547 fn from_string_mixed_ops() {
548 let c = Cigar::from_string("2=3I1X");
549 assert_eq!(
550 c.ops,
551 vec![
552 CigarElem::new(CigarOp::Match, 2),
553 CigarElem::new(CigarOp::Ins, 3),
554 CigarElem::new(CigarOp::Sub, 1)
555 ]
556 );
557 }
558
559 #[test]
560 fn from_string_no_counts() {
561 let c = Cigar::from_string("=XIDDD");
562 assert_eq!(
563 c.ops,
564 vec![
565 CigarElem::new(CigarOp::Match, 1),
566 CigarElem::new(CigarOp::Sub, 1),
567 CigarElem::new(CigarOp::Ins, 1),
568 CigarElem::new(CigarOp::Del, 3),
569 ]
570 );
571 }
572
573 #[test]
574 #[rustfmt::skip]
575 fn push_to_path() {
576 let mut c = Cigar::default();
577 c.push(CigarOp::Match); c.push(CigarOp::Del); c.push(CigarOp::Ins); c.push(CigarOp::Sub); assert_eq!(
584 c.to_path(),
585 [
586 Pos(0, 0),
587 Pos(1, 1),
588 Pos(2, 1),
589 Pos(2, 2),
590 Pos(3, 3),
591 ]
592 );
593 }
594
595 #[test]
596 fn to_char_pairs_all_match() {
597 let c = Cigar::from_string("3=");
598 let pairs = c.to_char_pairs(b"aaa", b"aaa");
599 assert_eq!(
600 pairs,
601 vec![
602 CigarOpChars::Match(b'a'),
603 CigarOpChars::Match(b'a'),
604 CigarOpChars::Match(b'a'),
605 ]
606 );
607 }
608
609 #[test]
610 fn to_char_pairs_sub() {
611 let c = Cigar::from_string("1X");
612 let pairs = c.to_char_pairs(b"a", b"c");
613 assert_eq!(pairs, vec![CigarOpChars::Sub(b'a', b'c')]);
614 }
615
616 #[test]
617 fn to_char_pairs_ins() {
618 let c = Cigar::from_string("1=1I1=");
619 let pairs = c.to_char_pairs(b"ac", b"abc");
620 assert_eq!(
621 pairs,
622 vec![
623 CigarOpChars::Match(b'a'),
624 CigarOpChars::Ins(b'b'),
625 CigarOpChars::Match(b'c'),
626 ]
627 );
628 }
629
630 #[test]
631 fn to_char_pairs_del() {
632 let c = Cigar::from_string("1=1D1=");
633 let pairs = c.to_char_pairs(b"abc", b"ac");
634 assert_eq!(
635 pairs,
636 vec![
637 CigarOpChars::Match(b'a'),
638 CigarOpChars::Del(b'b'),
639 CigarOpChars::Match(b'c'),
640 ]
641 );
642 }
643
644 #[test]
645 fn to_char_pairs_mixed() {
646 let c = Cigar::from_string("2=1X1I");
647 let pairs = c.to_char_pairs(b"abZd", b"abYc");
648 assert_eq!(
649 pairs,
650 vec![
651 CigarOpChars::Match(b'a'),
652 CigarOpChars::Match(b'b'),
653 CigarOpChars::Sub(b'Z', b'Y'),
654 CigarOpChars::Ins(b'c'),
655 ]
656 );
657 }
658}