1use itertools::Itertools;
31use serde::{Deserialize, Serialize};
32use std::fmt::Write;
33
34use crate::*;
35
36#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
38pub enum CigarOp {
39 Match,
41 Sub,
43 Del,
45 Ins,
47}
48
49#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
51pub enum CigarOpChars {
52 Match(u8),
53 Sub(u8, u8),
54 Del(u8),
55 Ins(u8),
56}
57
58#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
60pub struct CigarElem {
61 pub op: CigarOp,
62 pub cnt: I,
63}
64
65impl CigarElem {
66 pub fn new(op: CigarOp, cnt: I) -> Self {
67 Self { op, cnt }
68 }
69}
70
71#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Default)]
77pub struct Cigar {
78 pub ops: Vec<CigarElem>,
79}
80
81impl CigarOp {
82 pub fn to_char(&self) -> char {
84 match self {
85 CigarOp::Match => '=',
86 CigarOp::Sub => 'X',
87 CigarOp::Ins => 'I',
88 CigarOp::Del => 'D',
89 }
90 }
91
92 #[inline(always)]
94 pub fn delta(&self) -> Pos {
95 match self {
96 CigarOp::Match | CigarOp::Sub => Pos(1, 1),
97 CigarOp::Del => Pos(1, 0), CigarOp::Ins => Pos(0, 1), }
100 }
101
102 #[inline(always)]
106 pub fn from_delta(delta: Pos) -> Self {
107 match delta {
108 Pos(0, 1) => CigarOp::Ins,
109 Pos(1, 0) => CigarOp::Del,
110 Pos(1, 1) => CigarOp::Match,
111 _ => panic!("Invalid delta: {:?}", delta),
112 }
113 }
114}
115
116impl std::ops::Mul<I> for Pos {
118 type Output = Pos;
119 fn mul(self, rhs: I) -> Pos {
120 Pos(self.0 * rhs, self.1 * rhs)
121 }
122}
123
124impl From<u8> for CigarOp {
125 fn from(op: u8) -> Self {
129 match op {
130 b'=' | b'M' => CigarOp::Match,
131 b'X' => CigarOp::Sub,
132 b'I' => CigarOp::Ins,
133 b'D' => CigarOp::Del,
134 _ => panic!("Invalid CigarOp"),
135 }
136 }
137}
138
139impl ToString for Cigar {
140 fn to_string(&self) -> String {
142 let mut s = String::new();
143 for elem in &self.ops {
144 write!(&mut s, "{}{}", elem.cnt, elem.op.to_char()).unwrap();
145 }
146 s
147 }
148}
149
150impl Cigar {
151 pub fn from_ops(ops: impl Iterator<Item = CigarOp>) -> Self {
153 Cigar {
154 ops: ops
155 .chunk_by(|&op| op)
156 .into_iter()
157 .map(|(op, group)| CigarElem::new(op, group.count() as _))
158 .collect(),
159 }
160 }
161
162 pub fn from_path(text: Seq, pattern: Seq, path: &Path) -> Cigar {
168 if path[0] != Pos(0, 0) {
169 panic!("Path must start at (0,0)!");
170 }
171 Self::resolve_matches(
172 path.iter()
173 .tuple_windows()
174 .map(|(&text_pos, &pattern_pos)| {
175 CigarElem::new(CigarOp::from_delta(pattern_pos - text_pos), 1)
176 }),
177 text,
178 pattern,
179 )
180 }
181
182 pub fn to_char_pairs<'s>(&'s self, text: &'s [u8], pattern: &'s [u8]) -> Vec<CigarOpChars> {
184 let mut pos = Pos(0, 0);
185 let fix_case = !(b'A' ^ b'a');
186 let mut out = vec![];
187 for el in &self.ops {
188 for _ in 0..el.cnt {
189 let c = match el.op {
190 CigarOp::Match => {
191 CigarOpChars::Match(text[pos.0 as usize])
198 }
199 CigarOp::Sub => {
200 assert_ne!(
201 (text[pos.0 as usize] & fix_case) as char,
202 (pattern[pos.1 as usize] & fix_case) as char,
203 "cigar {:?}\npattern {:?}\ntext {:?}\nmismatch for {pos:?}",
204 self.to_string(),
205 String::from_utf8_lossy(pattern),
206 String::from_utf8_lossy(text)
207 );
208 CigarOpChars::Sub(text[pos.0 as usize], pattern[pos.1 as usize])
209 }
210 CigarOp::Del => {
211 CigarOpChars::Del(text[pos.0 as usize])
213 }
214 CigarOp::Ins => {
215 CigarOpChars::Ins(pattern[pos.1 as usize])
217 }
218 };
219 out.push(c);
220 pos += el.op.delta();
221 }
222 }
223 out
224 }
225
226 pub fn to_path(&self) -> Path {
228 let mut pos = Pos(0, 0);
229 let mut path = vec![pos];
230 for el in &self.ops {
231 for _ in 0..el.cnt {
232 pos += el.op.delta();
233 path.push(pos);
234 }
235 }
236 path
237 }
238
239 pub fn to_path_with_costs(&self, cm: CostModel) -> Vec<(Pos, Cost)> {
241 let mut pos = Pos(0, 0);
242 let mut cost = 0;
243 let mut path = vec![(pos, cost)];
244
245 for el in &self.ops {
246 match el.op {
247 CigarOp::Match => {
248 for _ in 0..el.cnt {
249 pos += el.op.delta();
250 path.push((pos, cost));
251 }
252 }
253 CigarOp::Sub => {
254 for _ in 0..el.cnt {
255 pos += el.op.delta();
256 cost += cm.sub;
257 path.push((pos, cost));
258 }
259 }
260 CigarOp::Ins => {
261 for len in 1..=(el.cnt as Cost) {
262 pos += el.op.delta();
263 path.push((pos, cost + cm.ins(len)));
264 }
265 cost += cm.ins(el.cnt);
266 }
267 CigarOp::Del => {
268 for len in 1..=(el.cnt as Cost) {
269 pos += el.op.delta();
270 path.push((pos, cost + cm.del(len)));
271 }
272 cost += cm.del(el.cnt);
273 }
274 }
275 }
276 path
277 }
278
279 pub fn push(&mut self, op: CigarOp) {
281 if let Some(s) = self.ops.last_mut() {
282 if s.op == op {
283 s.cnt += 1;
284 return;
285 }
286 }
287 self.ops.push(CigarElem { op, cnt: 1 });
288 }
289
290 pub fn push_elem(&mut self, e: CigarElem) {
292 if let Some(s) = self.ops.last_mut() {
293 if s.op == e.op {
294 s.cnt += e.cnt;
295 return;
296 }
297 }
298 self.ops.push(e);
299 }
300
301 pub fn push_matches(&mut self, cnt: I) {
303 if let Some(s) = self.ops.last_mut() {
304 if s.op == CigarOp::Match {
305 s.cnt += cnt;
306 return;
307 }
308 }
309 self.ops.push(CigarElem {
310 op: CigarOp::Match,
311 cnt: cnt as _,
312 });
313 }
314
315 pub fn verify(&self, cm: &CostModel, text: Seq, pattern: Seq) -> Result<Cost, &str> {
317 let mut pos = Pos(0, 0);
318 let mut cost: Cost = 0;
319
320 for &CigarElem { op, cnt } in &self.ops {
321 match op {
322 CigarOp::Match => {
323 for _ in 0..cnt {
324 if text.get(pos.0 as usize) != pattern.get(pos.1 as usize) {
325 return Err("Expected match but found substitution.");
326 }
327 pos += op.delta();
328 }
329 }
330 CigarOp::Sub => {
331 for _ in 0..cnt {
332 if text.get(pos.0 as usize) == pattern.get(pos.1 as usize) {
333 return Err("Expected substitution but found match.");
334 }
335 pos += op.delta();
336 cost += cm.sub;
337 }
338 }
339 CigarOp::Ins => {
340 cost += cm.open + cnt as Cost * cm.extend;
341 pos += op.delta() * cnt;
342 }
343 CigarOp::Del => {
344 cost += cm.open + cnt as Cost * cm.extend;
345 pos += op.delta() * cnt;
346 }
347 }
348 }
349 if pos != Pos(text.len() as I, pattern.len() as I) {
350 return Err("Wrong alignment length.");
351 }
352
353 Ok(cost)
354 }
355
356 pub fn resolve_matches(ops: impl Iterator<Item = CigarElem>, text: Seq, pattern: Seq) -> Self {
358 let mut pos = Pos(0, 0);
359 let mut c = Cigar { ops: vec![] };
360 for CigarElem { op, cnt } in ops {
361 match op {
362 CigarOp::Match => {
363 for _ in 0..cnt {
364 c.push(if text[pos.0 as usize] == pattern[pos.1 as usize] {
365 CigarOp::Match
366 } else {
367 CigarOp::Sub
368 });
369 pos += op.delta();
370 }
371 continue;
372 }
373 _ => {
374 pos += op.delta() * cnt;
375 }
376 };
377 c.push_elem(CigarElem { op, cnt });
378 }
379 c
380 }
381
382 pub fn parse_without_counts(s: &str, text: Seq, pattern: Seq) -> Self {
385 Self::resolve_matches(
386 s.as_bytes().iter().map(|&op| CigarElem {
387 op: op.into(),
388 cnt: 1,
389 }),
390 text,
391 pattern,
392 )
393 }
394
395 pub fn parse_without_resolving(s: &str) -> Self {
398 let mut c = Cigar { ops: vec![] };
399 for &op in s.as_bytes() {
400 c.push(op.into())
401 }
402 c
403 }
404
405 pub fn from_string(s: &str) -> Self {
407 let mut c = Cigar { ops: vec![] };
408 for slice in s.as_bytes().split_inclusive(|b| !b.is_ascii_digit()) {
409 let (&op, cnt_bytes) = slice.split_last().expect("Cigar string cannot be empty");
410 let cnt = if cnt_bytes.is_empty() {
411 1
412 } else {
413 unsafe { std::str::from_utf8_unchecked(cnt_bytes) }
414 .parse()
415 .expect("Invalid Cigar count")
416 };
417 c.push_elem(CigarElem { op: op.into(), cnt });
418 }
419 c
420 }
421
422 pub fn parse(s: &str, text: Seq, pattern: Seq) -> Self {
425 Self::resolve_matches(
426 s.as_bytes()
427 .split_inclusive(|pattern| pattern.is_ascii_alphabetic())
428 .map(|pattern_slice| {
429 let (&op, cnt) = pattern_slice.split_last().unwrap();
430 let cnt = if cnt.is_empty() {
431 1
432 } else {
433 unsafe { std::str::from_utf8_unchecked(cnt) }
434 .parse()
435 .unwrap()
436 };
437 CigarElem { op: op.into(), cnt }
438 }),
439 text,
440 pattern,
441 )
442 }
443
444 pub fn clear(&mut self) {
446 self.ops.clear();
447 }
448
449 pub fn reverse(&mut self) {
451 self.ops.reverse();
452 }
453}
454
455#[cfg(test)]
456mod tests {
457 use super::*;
458
459 #[test]
460 fn test_delta() {
461 for op in [CigarOp::Match, CigarOp::Del, CigarOp::Ins] {
462 assert_eq!(CigarOp::from_delta(op.delta()), op);
463 }
464 assert_eq!(CigarOp::from_delta(Pos(1, 1)), CigarOp::Match);
465 assert_eq!(CigarOp::from_delta(Pos(1, 0)), CigarOp::Del); assert_eq!(CigarOp::from_delta(Pos(0, 1)), CigarOp::Ins); }
469
470 #[test]
471 fn test_valid_eq() {
472 let c = Cigar::from_path(b"ab", b"aa", &vec![Pos(0, 0), Pos(1, 1), Pos(2, 2)]);
473 assert_eq!(c.to_string(), "1=1X");
474 }
475
476 #[test]
477 fn test_invalid_end_length() {
478 let c = Cigar::from_path(b"ab", b"aa", &vec![Pos(0, 0), Pos(0, 1)]);
480 assert!(c.verify(&CostModel::unit(), b"ab", b"aa").is_err());
481 }
482
483 #[test]
484 fn to_string() {
485 let c = Cigar {
486 ops: vec![
487 CigarElem {
488 op: CigarOp::Ins,
489 cnt: 1,
490 },
491 CigarElem {
492 op: CigarOp::Match,
493 cnt: 2,
494 },
495 ],
496 };
497 assert_eq!(c.to_string(), "1I2=");
498 }
499
500 #[test]
501 fn from_path() {
502 let c = Cigar::from_path(
503 b"aaa",
504 b"aabc",
505 &vec![Pos(0, 0), Pos(1, 1), Pos(2, 2), Pos(3, 3), Pos(3, 4)],
506 );
507 assert_eq!(c.to_string(), "2=1X1I");
508 }
509
510 #[test]
511 fn from_string_with_count() {
512 let c = Cigar::from_string("24=");
513 assert_eq!(c.ops.len(), 1);
514 assert_eq!(c.ops[0], CigarElem::new(CigarOp::Match, 24));
515 }
516
517 #[test]
518 fn from_string_mixed_ops() {
519 let c = Cigar::from_string("2=3I1X");
520 assert_eq!(
521 c.ops,
522 vec![
523 CigarElem::new(CigarOp::Match, 2),
524 CigarElem::new(CigarOp::Ins, 3),
525 CigarElem::new(CigarOp::Sub, 1)
526 ]
527 );
528 }
529
530 #[test]
531 fn from_string_no_counts() {
532 let c = Cigar::from_string("=XIDDD");
533 assert_eq!(
534 c.ops,
535 vec![
536 CigarElem::new(CigarOp::Match, 1),
537 CigarElem::new(CigarOp::Sub, 1),
538 CigarElem::new(CigarOp::Ins, 1),
539 CigarElem::new(CigarOp::Del, 3),
540 ]
541 );
542 }
543
544 #[test]
545 #[rustfmt::skip]
546 fn push_to_path() {
547 let mut c = Cigar::default();
548 c.push(CigarOp::Match); c.push(CigarOp::Del); c.push(CigarOp::Ins); c.push(CigarOp::Sub); assert_eq!(
555 c.to_path(),
556 [
557 Pos(0, 0),
558 Pos(1, 1),
559 Pos(2, 1),
560 Pos(2, 2),
561 Pos(3, 3),
562 ]
563 );
564 }
565
566 #[test]
567 fn to_char_pairs_all_match() {
568 let c = Cigar::from_string("3=");
569 let pairs = c.to_char_pairs(b"aaa", b"aaa");
570 assert_eq!(
571 pairs,
572 vec![
573 CigarOpChars::Match(b'a'),
574 CigarOpChars::Match(b'a'),
575 CigarOpChars::Match(b'a'),
576 ]
577 );
578 }
579
580 #[test]
581 fn to_char_pairs_sub() {
582 let c = Cigar::from_string("1X");
583 let pairs = c.to_char_pairs(b"a", b"c");
584 assert_eq!(pairs, vec![CigarOpChars::Sub(b'a', b'c')]);
585 }
586
587 #[test]
588 fn to_char_pairs_ins() {
589 let c = Cigar::from_string("1=1I1=");
590 let pairs = c.to_char_pairs(b"ac", b"abc");
591 assert_eq!(
592 pairs,
593 vec![
594 CigarOpChars::Match(b'a'),
595 CigarOpChars::Ins(b'b'),
596 CigarOpChars::Match(b'c'),
597 ]
598 );
599 }
600
601 #[test]
602 fn to_char_pairs_del() {
603 let c = Cigar::from_string("1=1D1=");
604 let pairs = c.to_char_pairs(b"abc", b"ac");
605 assert_eq!(
606 pairs,
607 vec![
608 CigarOpChars::Match(b'a'),
609 CigarOpChars::Del(b'b'),
610 CigarOpChars::Match(b'c'),
611 ]
612 );
613 }
614
615 #[test]
616 fn to_char_pairs_mixed() {
617 let c = Cigar::from_string("2=1X1I");
618 let pairs = c.to_char_pairs(b"abZd", b"abYc");
619 assert_eq!(
620 pairs,
621 vec![
622 CigarOpChars::Match(b'a'),
623 CigarOpChars::Match(b'b'),
624 CigarOpChars::Sub(b'Z', b'Y'),
625 CigarOpChars::Ins(b'c'),
626 ]
627 );
628 }
629}