1use crate::caller::{
76 ConsensusCaller, ConsensusCallingStats, ConsensusOutput,
77 RejectionReason as CallerRejectionReason,
78};
79use crate::phred::{MIN_PHRED, NO_CALL_BASE, NO_CALL_BASE_LOWER, PhredScore};
80use crate::simple_umi::consensus_umis;
81use crate::vanilla_caller::{
82 VanillaConsensusRead, VanillaUmiConsensusCaller, VanillaUmiConsensusOptions,
83};
84use crate::{IndexedSourceRead, SourceRead, select_most_common_alignment_group};
85use anyhow::Result;
86use fgumi_dna::dna::reverse_complement;
87use fgumi_raw_bam::{self as bam_fields, UnmappedBamRecordBuilder, flags};
88use noodles::sam::alignment::record::data::field::Tag;
89use rand::SeedableRng;
90use rand::rngs::StdRng;
91use rand::seq::SliceRandom;
92use std::cmp::{max, min};
93use std::collections::HashMap;
94
95#[derive(Debug, Clone)]
97pub struct CodecConsensusOptions {
98 pub min_input_base_quality: PhredScore,
100
101 pub error_rate_pre_umi: PhredScore,
103
104 pub error_rate_post_umi: PhredScore,
106
107 pub min_reads_per_strand: usize,
109
110 pub max_reads_per_strand: Option<usize>,
112
113 pub min_duplex_length: usize,
115
116 pub single_strand_qual: Option<PhredScore>,
118
119 pub outer_bases_qual: Option<PhredScore>,
121
122 pub outer_bases_length: usize,
124
125 pub max_duplex_disagreements: usize,
127
128 pub max_duplex_disagreement_rate: f64,
130
131 pub cell_tag: Option<Tag>,
133
134 pub produce_per_base_tags: bool,
136
137 pub trim: bool,
139
140 pub min_consensus_base_quality: PhredScore,
142}
143
144impl Default for CodecConsensusOptions {
145 fn default() -> Self {
146 Self {
147 min_input_base_quality: 10,
148 error_rate_pre_umi: 45,
149 error_rate_post_umi: 40,
150 min_reads_per_strand: 1,
151 max_reads_per_strand: None,
152 min_duplex_length: 1,
153 single_strand_qual: None,
154 outer_bases_qual: None,
155 outer_bases_length: 5,
156 max_duplex_disagreements: usize::MAX,
157 max_duplex_disagreement_rate: 1.0,
158 cell_tag: None,
159 produce_per_base_tags: false,
160 trim: false,
161 min_consensus_base_quality: 0,
162 }
163 }
164}
165
166#[derive(Debug, Clone, Default)]
168pub struct CodecConsensusStats {
169 pub total_input_reads: u64,
171
172 pub consensus_reads_generated: u64,
174
175 pub reads_filtered: u64,
177
178 pub consensus_reads_rejected_hdd: u64,
180
181 pub consensus_bases_emitted: u64,
183
184 pub consensus_duplex_bases_emitted: u64,
186
187 pub duplex_disagreement_base_count: u64,
189
190 pub rejection_reasons: HashMap<CallerRejectionReason, usize>,
192}
193
194impl CodecConsensusStats {
195 #[must_use]
197 pub fn duplex_disagreement_rate(&self) -> f64 {
198 if self.consensus_duplex_bases_emitted > 0 {
199 #[expect(
200 clippy::cast_precision_loss,
201 reason = "acceptable precision loss for rate computation"
202 )]
203 {
204 self.duplex_disagreement_base_count as f64
205 / self.consensus_duplex_bases_emitted as f64
206 }
207 } else {
208 0.0
209 }
210 }
211}
212
213#[derive(Debug, Clone)]
219struct SingleStrandConsensus {
220 bases: Vec<u8>,
222 quals: Vec<PhredScore>,
224 depths: Vec<u16>,
226 errors: Vec<u16>,
228 raw_read_count: usize,
230 ref_start: usize,
232 ref_end: usize,
234 is_negative_strand: bool,
236}
237
238struct ClippedRecordInfo {
243 raw_idx: usize,
245 clip_amount: usize,
247 clip_from_start: bool,
249 clipped_seq_len: usize,
251 clipped_cigar: Vec<u32>,
253 adjusted_pos: usize,
255 flags: u16,
257}
258
259pub struct CodecConsensusCaller {
264 read_name_prefix: String,
266
267 read_group_id: String,
269
270 options: CodecConsensusOptions,
272
273 stats: CodecConsensusStats,
275
276 rng: StdRng,
278
279 consensus_counter: u64,
281
282 ss_caller: VanillaUmiConsensusCaller,
285
286 bam_builder: UnmappedBamRecordBuilder,
288
289 track_rejects: bool,
291
292 rejected_reads: Vec<Vec<u8>>,
294}
295
296#[expect(
297 clippy::similar_names,
298 clippy::cast_precision_loss,
299 clippy::cast_possible_truncation,
300 clippy::cast_possible_wrap,
301 clippy::cast_sign_loss,
302 reason = "consensus calling uses numeric casts for quality/position math and has domain-specific variable names"
303)]
304impl CodecConsensusCaller {
305 #[must_use]
312 pub fn new(
313 read_name_prefix: String,
314 read_group_id: String,
315 options: CodecConsensusOptions,
316 ) -> Self {
317 let rng = StdRng::seed_from_u64(42);
318
319 let ss_options = VanillaUmiConsensusOptions {
325 tag: "MI".to_string(),
326 error_rate_pre_umi: options.error_rate_pre_umi,
327 error_rate_post_umi: options.error_rate_post_umi,
328 min_input_base_quality: options.min_input_base_quality,
329 min_reads: 1, max_reads: None,
331 produce_per_base_tags: true,
332 seed: None,
333 trim: false,
334 min_consensus_base_quality: 0, cell_tag: None,
336 };
337
338 let ss_caller = VanillaUmiConsensusCaller::new(
339 "x".to_string(), read_group_id.clone(),
341 ss_options,
342 );
343
344 Self {
345 read_name_prefix,
346 read_group_id,
347 options,
348 stats: CodecConsensusStats::default(),
349 rng,
350 consensus_counter: 0,
351 ss_caller,
352 bam_builder: UnmappedBamRecordBuilder::new(),
353 track_rejects: false,
354 rejected_reads: Vec::new(),
355 }
356 }
357
358 #[must_use]
363 pub fn new_with_rejects_tracking(
364 read_name_prefix: String,
365 read_group_id: String,
366 options: CodecConsensusOptions,
367 track_rejects: bool,
368 ) -> Self {
369 let mut caller = Self::new(read_name_prefix, read_group_id, options);
370 caller.track_rejects = track_rejects;
371 caller
372 }
373
374 #[must_use]
376 pub fn statistics(&self) -> &CodecConsensusStats {
377 &self.stats
378 }
379
380 pub fn clear(&mut self) {
385 self.stats = CodecConsensusStats::default();
386 self.ss_caller.clear();
387 self.rejected_reads.clear();
388 }
389
390 #[must_use]
392 pub fn rejected_reads(&self) -> &[Vec<u8>] {
393 &self.rejected_reads
394 }
395
396 pub fn clear_rejected_reads(&mut self) {
398 self.rejected_reads.clear();
399 }
400
401 pub fn take_rejected_reads(&mut self) -> Vec<Vec<u8>> {
403 std::mem::take(&mut self.rejected_reads)
404 }
405
406 fn to_source_read_for_codec_raw(
412 raw: &[u8],
413 original_idx: usize,
414 clip_amount: usize,
415 clip_from_start: bool,
416 clipped_cigar: Option<&[u32]>,
417 ) -> SourceRead {
418 let mut bases = bam_fields::extract_sequence(raw);
419 let mut quals = bam_fields::quality_scores_slice(raw).to_vec();
420 let flg = bam_fields::flags(raw);
421
422 let clip_amount = clip_amount.min(bases.len());
425 if clip_amount > 0 {
426 if clip_from_start {
427 bases = bases[clip_amount..].to_vec();
428 quals = quals[clip_amount..].to_vec();
429 } else {
430 let new_len = bases.len() - clip_amount;
431 bases.truncate(new_len);
432 quals.truncate(new_len);
433 }
434 }
435
436 let mut simplified = if let Some(ops) = clipped_cigar {
438 bam_fields::simplify_cigar_from_raw(ops)
439 } else {
440 let original_ops = bam_fields::get_cigar_ops(raw);
441 let (clipped_ops, _) =
442 bam_fields::clip_cigar_ops_raw(&original_ops, clip_amount, clip_from_start);
443 bam_fields::simplify_cigar_from_raw(&clipped_ops)
444 };
445
446 let is_negative = flg & flags::REVERSE != 0;
447 if is_negative {
448 simplified.reverse();
449 bases = reverse_complement(&bases);
450 quals.reverse();
451 }
452
453 SourceRead { original_idx, bases, quals, simplified_cigar: simplified, flags: flg }
454 }
455
456 fn vanilla_to_single_strand(
462 vcr: VanillaConsensusRead,
463 is_negative_strand: bool,
464 raw_read_count: usize,
465 ) -> SingleStrandConsensus {
466 let len = vcr.bases.len();
467 SingleStrandConsensus {
468 bases: vcr.bases,
469 quals: vcr.quals,
470 depths: vcr.depths,
471 errors: vcr.errors,
472 raw_read_count,
473 ref_start: 0,
474 ref_end: len.saturating_sub(1),
475 is_negative_strand,
476 }
477 }
478
479 fn reverse_complement_ss(ss: &SingleStrandConsensus) -> SingleStrandConsensus {
488 SingleStrandConsensus {
489 bases: reverse_complement(&ss.bases),
490 quals: ss.quals.iter().copied().rev().collect(),
491 depths: ss.depths.iter().copied().rev().collect(),
494 errors: ss.errors.iter().copied().rev().collect(),
495 raw_read_count: ss.raw_read_count,
496 ref_start: ss.ref_start,
497 ref_end: ss.ref_end,
498 is_negative_strand: !ss.is_negative_strand,
499 }
500 }
501
502 #[expect(
508 clippy::too_many_lines,
509 reason = "consensus pipeline has many sequential steps that are clearest in one function"
510 )]
511 fn consensus_reads_raw(&mut self, records: &[Vec<u8>]) -> Result<ConsensusOutput> {
512 self.stats.total_input_reads += records.len() as u64;
513
514 if records.is_empty() {
515 return Ok(ConsensusOutput::default());
516 }
517
518 let umi: Option<String> = bam_fields::find_string_tag_in_record(&records[0], b"MI")
520 .map(|b| String::from_utf8_lossy(b).to_string());
521
522 let mut paired_indices: Vec<usize> = Vec::new();
524 let mut frag_count = 0usize;
525 for (i, raw) in records.iter().enumerate() {
526 let flg = bam_fields::flags(raw);
527 if flg & flags::PAIRED == 0 {
528 frag_count += 1;
529 continue;
530 }
531 if flg & (flags::SECONDARY | flags::SUPPLEMENTARY | flags::UNMAPPED) != 0 {
533 continue;
534 }
535 if !bam_fields::is_fr_pair_raw(raw) {
536 continue;
537 }
538 paired_indices.push(i);
539 }
540
541 if frag_count > 0 {
542 self.reject_records_count(frag_count, CallerRejectionReason::FragmentRead);
543 }
544
545 if paired_indices.is_empty() {
546 return Ok(ConsensusOutput::default());
547 }
548
549 let mut by_name: HashMap<&[u8], Vec<usize>> = HashMap::new();
551 let mut name_order: Vec<&[u8]> = Vec::new();
552 for &idx in &paired_indices {
553 let name = bam_fields::read_name(&records[idx]);
554 if !by_name.contains_key(name) {
555 name_order.push(name);
556 }
557 by_name.entry(name).or_default().push(idx);
558 }
559
560 let mut r1_infos: Vec<ClippedRecordInfo> = Vec::new();
561 let mut r2_infos: Vec<ClippedRecordInfo> = Vec::new();
562
563 for name in name_order {
564 let indices = by_name.get(name).expect("name from iteration");
565 if indices.len() != 2 {
566 continue;
567 }
568
569 let (i1, i2) = {
570 let flg0 = bam_fields::flags(&records[indices[0]]);
571 if flg0 & flags::FIRST_SEGMENT != 0 {
572 (indices[0], indices[1])
573 } else {
574 (indices[1], indices[0])
575 }
576 };
577
578 let clip_r1 = bam_fields::num_bases_extending_past_mate_raw(&records[i1]);
580 let clip_r2 = bam_fields::num_bases_extending_past_mate_raw(&records[i2]);
581
582 let r1_info = Self::build_clipped_info(&records[i1], i1, clip_r1);
584 let r2_info = Self::build_clipped_info(&records[i2], i2, clip_r2);
586
587 r1_infos.push(r1_info);
588 r2_infos.push(r2_info);
589 }
590
591 if r1_infos.is_empty() {
592 return Ok(ConsensusOutput::default());
593 }
594
595 if r1_infos.len() < self.options.min_reads_per_strand {
597 self.reject_records_count(
598 r1_infos.len() + r2_infos.len(),
599 CallerRejectionReason::InsufficientReads,
600 );
601 return Ok(ConsensusOutput::default());
602 }
603
604 if let Some(max_reads) = self.options.max_reads_per_strand {
606 if r1_infos.len() > max_reads {
607 let mut indices: Vec<usize> = (0..r1_infos.len()).collect();
608 indices.shuffle(&mut self.rng);
609 indices.truncate(max_reads);
610 indices.sort_unstable();
611
612 let new_r1: Vec<_> = indices
613 .iter()
614 .map(|&i| std::mem::replace(&mut r1_infos[i], Self::dummy_info()))
615 .collect();
616 let new_r2: Vec<_> = indices
617 .iter()
618 .map(|&i| std::mem::replace(&mut r2_infos[i], Self::dummy_info()))
619 .collect();
620 r1_infos = new_r1;
621 r2_infos = new_r2;
622 }
623 }
624
625 let r1_infos = self.filter_to_most_common_alignment_raw(r1_infos);
627 let r2_infos = self.filter_to_most_common_alignment_raw(r2_infos);
628
629 if r1_infos.is_empty() || r2_infos.is_empty() {
630 return Ok(ConsensusOutput::default());
631 }
632
633 if r1_infos.len() < self.options.min_reads_per_strand
634 || r2_infos.len() < self.options.min_reads_per_strand
635 {
636 self.reject_records_count(
637 r1_infos.len() + r2_infos.len(),
638 CallerRejectionReason::InsufficientReads,
639 );
640 return Ok(ConsensusOutput::default());
641 }
642
643 let longest_r1 = r1_infos
646 .iter()
647 .rev()
648 .max_by_key(|info| bam_fields::reference_length_from_cigar(&info.clipped_cigar))
649 .expect("non-empty");
650 let longest_r2 = r2_infos
651 .iter()
652 .rev()
653 .max_by_key(|info| bam_fields::reference_length_from_cigar(&info.clipped_cigar))
654 .expect("non-empty");
655
656 let r1_is_negative = longest_r1.flags & flags::REVERSE != 0;
657 let (longest_pos, longest_neg) =
658 if r1_is_negative { (longest_r2, longest_r1) } else { (longest_r1, longest_r2) };
659
660 let neg_start = longest_neg.adjusted_pos;
661 let pos_start = longest_pos.adjusted_pos;
662 let pos_ref_len =
663 bam_fields::reference_length_from_cigar(&longest_pos.clipped_cigar) as usize;
664 let pos_end = pos_start + pos_ref_len.saturating_sub(1);
665
666 let (overlap_start, overlap_end) = (neg_start, pos_end);
667 let duplex_length = overlap_end as i64 - overlap_start as i64 + 1;
668
669 if duplex_length < self.options.min_duplex_length as i64 {
670 self.reject_records_count(
671 r1_infos.len() + r2_infos.len(),
672 CallerRejectionReason::InsufficientOverlap,
673 );
674 return Ok(ConsensusOutput::default());
675 }
676
677 if !Self::check_overlap_phase_raw(longest_r1, longest_r2, overlap_start, overlap_end) {
679 self.reject_records_count(
680 r1_infos.len() + r2_infos.len(),
681 CallerRejectionReason::IndelErrorBetweenStrands,
682 );
683 return Ok(ConsensusOutput::default());
684 }
685
686 let r2_is_negative = longest_r2.flags & flags::REVERSE != 0;
687
688 let consensus_length =
690 Self::compute_consensus_length_raw(longest_pos, longest_neg, overlap_end);
691 let Some(consensus_length) = consensus_length else {
692 self.reject_records_count(
693 r1_infos.len() + r2_infos.len(),
694 CallerRejectionReason::IndelErrorBetweenStrands,
695 );
696 return Ok(ConsensusOutput::default());
697 };
698
699 let r1_is_neg_strand = r1_infos.first().is_some_and(|i| i.flags & flags::REVERSE != 0);
701 let r1_source_reads: Vec<SourceRead> = r1_infos
702 .iter()
703 .enumerate()
704 .map(|(idx, info)| {
705 Self::to_source_read_for_codec_raw(
706 &records[info.raw_idx],
707 idx,
708 info.clip_amount,
709 info.clip_from_start,
710 Some(&info.clipped_cigar),
711 )
712 })
713 .collect();
714
715 let umi_str = umi.as_deref().unwrap_or("");
716 let ss_r1 = match self.ss_caller.consensus_call(umi_str, r1_source_reads)? {
717 Some(vcr) => Self::vanilla_to_single_strand(vcr, r1_is_neg_strand, r1_infos.len()),
718 None => return Ok(ConsensusOutput::default()),
719 };
720
721 let r2_is_neg_strand = r2_infos.first().is_some_and(|i| i.flags & flags::REVERSE != 0);
722 let r2_source_reads: Vec<SourceRead> = r2_infos
723 .iter()
724 .enumerate()
725 .map(|(idx, info)| {
726 Self::to_source_read_for_codec_raw(
727 &records[info.raw_idx],
728 idx,
729 info.clip_amount,
730 info.clip_from_start,
731 Some(&info.clipped_cigar),
732 )
733 })
734 .collect();
735
736 let ss_r2 = match self.ss_caller.consensus_call(umi_str, r2_source_reads)? {
737 Some(vcr) => Self::vanilla_to_single_strand(vcr, r2_is_neg_strand, r2_infos.len()),
738 None => return Ok(ConsensusOutput::default()),
739 };
740
741 if consensus_length < ss_r1.bases.len() || consensus_length < ss_r2.bases.len() {
742 self.reject_records_count(
743 r1_infos.len() + r2_infos.len(),
744 CallerRejectionReason::IndelErrorBetweenStrands,
745 );
746 return Ok(ConsensusOutput::default());
747 }
748
749 let (ss_r1_oriented, ss_r2_oriented) = if r1_is_negative {
751 (Self::reverse_complement_ss(&ss_r1), ss_r2.clone())
752 } else {
753 (ss_r1.clone(), Self::reverse_complement_ss(&ss_r2))
754 };
755
756 let padded_r1 = Self::pad_consensus(&ss_r1_oriented, consensus_length, r1_is_negative);
757 let padded_r2 = Self::pad_consensus(&ss_r2_oriented, consensus_length, r2_is_negative);
758
759 let consensus = self.build_duplex_consensus_from_padded(&padded_r1, &padded_r2)?;
760 let consensus = self.mask_consensus_quals_query_based(consensus, &padded_r1, &padded_r2);
761 let consensus =
762 if r1_is_negative { Self::reverse_complement_ss(&consensus) } else { consensus };
763
764 let (ss_for_ac, ss_for_bc) = if r1_is_negative {
765 (Self::reverse_complement_ss(&padded_r1), Self::reverse_complement_ss(&padded_r2))
766 } else {
767 (padded_r1.clone(), padded_r2.clone())
768 };
769
770 let all_paired_raws: Vec<&[u8]> = r1_infos
774 .iter()
775 .chain(r2_infos.iter())
776 .map(|info| records[info.raw_idx].as_slice())
777 .collect();
778
779 let mut output = ConsensusOutput::default();
780 self.build_output_record_into(
781 &mut output,
782 &consensus,
783 &ss_for_ac,
784 &ss_for_bc,
785 umi.as_deref(),
786 &all_paired_raws,
787 records,
788 )?;
789
790 self.stats.consensus_reads_generated += 1;
791 Ok(output)
792 }
793
794 fn build_clipped_info(raw: &[u8], raw_idx: usize, clip_amount: usize) -> ClippedRecordInfo {
796 let flg = bam_fields::flags(raw);
797 let is_reverse = flg & flags::REVERSE != 0;
798 let clip_from_start = is_reverse; let original_ops = bam_fields::get_cigar_ops(raw);
801 let (clipped_cigar, ref_bases_consumed) =
802 bam_fields::clip_cigar_ops_raw(&original_ops, clip_amount, clip_from_start);
803
804 let original_seq_len = bam_fields::l_seq(raw) as usize;
805 let clipped_seq_len = original_seq_len.saturating_sub(clip_amount);
806
807 let pos_0based = bam_fields::pos(raw);
809 debug_assert!(
810 pos_0based >= 0,
811 "build_clipped_info called on unmapped record (pos={pos_0based})"
812 );
813 let adjusted_pos = if clip_from_start {
814 (pos_0based + 1) as usize + ref_bases_consumed
815 } else {
816 (pos_0based + 1) as usize
817 };
818
819 ClippedRecordInfo {
820 raw_idx,
821 clip_amount,
822 clip_from_start,
823 clipped_seq_len,
824 clipped_cigar,
825 adjusted_pos,
826 flags: flg,
827 }
828 }
829
830 fn dummy_info() -> ClippedRecordInfo {
832 ClippedRecordInfo {
833 raw_idx: 0,
834 clip_amount: 0,
835 clip_from_start: false,
836 clipped_seq_len: 0,
837 clipped_cigar: Vec::new(),
838 adjusted_pos: 0,
839 flags: 0,
840 }
841 }
842
843 fn filter_to_most_common_alignment_raw(
845 &mut self,
846 infos: Vec<ClippedRecordInfo>,
847 ) -> Vec<ClippedRecordInfo> {
848 if infos.len() < 2 {
849 return infos;
850 }
851
852 let mut indexed: Vec<IndexedSourceRead> = infos
853 .iter()
854 .enumerate()
855 .map(|(i, info)| {
856 let mut cigar = bam_fields::simplify_cigar_from_raw(&info.clipped_cigar);
857 if info.flags & flags::REVERSE != 0 {
858 cigar.reverse();
859 }
860 (i, info.clipped_seq_len, cigar)
861 })
862 .collect();
863
864 indexed.sort_by(|a, b| b.1.cmp(&a.1));
865
866 let best_indices = select_most_common_alignment_group(&indexed);
867
868 let rejected_count = infos.len() - best_indices.len();
869 if rejected_count > 0 {
870 *self
871 .stats
872 .rejection_reasons
873 .entry(CallerRejectionReason::MinorityAlignment)
874 .or_insert(0) += rejected_count;
875 self.stats.reads_filtered += rejected_count as u64;
876 }
877
878 let best_set: std::collections::HashSet<usize> = best_indices.into_iter().collect();
879 infos
880 .into_iter()
881 .enumerate()
882 .filter(|(i, _)| best_set.contains(i))
883 .map(|(_, info)| info)
884 .collect()
885 }
886
887 fn check_overlap_phase_raw(
889 r1: &ClippedRecordInfo,
890 r2: &ClippedRecordInfo,
891 overlap_start: usize,
892 overlap_end: usize,
893 ) -> bool {
894 let r1s = bam_fields::read_pos_at_ref_pos_raw(
895 &r1.clipped_cigar,
896 r1.adjusted_pos,
897 overlap_start,
898 true,
899 );
900 let r2s = bam_fields::read_pos_at_ref_pos_raw(
901 &r2.clipped_cigar,
902 r2.adjusted_pos,
903 overlap_start,
904 true,
905 );
906 let r1e = bam_fields::read_pos_at_ref_pos_raw(
907 &r1.clipped_cigar,
908 r1.adjusted_pos,
909 overlap_end,
910 true,
911 );
912 let r2e = bam_fields::read_pos_at_ref_pos_raw(
913 &r2.clipped_cigar,
914 r2.adjusted_pos,
915 overlap_end,
916 true,
917 );
918
919 match (r1s, r2s, r1e, r2e) {
920 (Some(a), Some(b), Some(c), Some(d)) => (a as i64 - b as i64) == (c as i64 - d as i64),
921 _ => false,
922 }
923 }
924
925 fn compute_consensus_length_raw(
927 pos_info: &ClippedRecordInfo,
928 neg_info: &ClippedRecordInfo,
929 overlap_end: usize,
930 ) -> Option<usize> {
931 let pos_read_pos = bam_fields::read_pos_at_ref_pos_raw(
932 &pos_info.clipped_cigar,
933 pos_info.adjusted_pos,
934 overlap_end,
935 false,
936 )?;
937 let neg_read_pos = bam_fields::read_pos_at_ref_pos_raw(
938 &neg_info.clipped_cigar,
939 neg_info.adjusted_pos,
940 overlap_end,
941 false,
942 )?;
943
944 Some(pos_read_pos + neg_info.clipped_seq_len - neg_read_pos)
945 }
946
947 fn pad_consensus(
958 ss: &SingleStrandConsensus,
959 new_length: usize,
960 pad_left: bool,
961 ) -> SingleStrandConsensus {
962 let current_len = ss.bases.len();
963 if new_length <= current_len {
964 return ss.clone();
965 }
966
967 let pad_len = new_length - current_len;
968 let pad_bases = vec![NO_CALL_BASE_LOWER; pad_len]; let pad_quals = vec![0u8; pad_len];
970 let pad_depths = vec![0u16; pad_len];
971 let pad_errors = vec![0u16; pad_len];
972
973 let (bases, quals, depths, errors) = if pad_left {
974 (
975 [pad_bases, ss.bases.clone()].concat(),
976 [pad_quals, ss.quals.clone()].concat(),
977 [pad_depths, ss.depths.clone()].concat(),
978 [pad_errors, ss.errors.clone()].concat(),
979 )
980 } else {
981 (
982 [ss.bases.clone(), pad_bases].concat(),
983 [ss.quals.clone(), pad_quals].concat(),
984 [ss.depths.clone(), pad_depths].concat(),
985 [ss.errors.clone(), pad_errors].concat(),
986 )
987 };
988
989 SingleStrandConsensus {
990 bases,
991 quals,
992 depths,
993 errors,
994 raw_read_count: ss.raw_read_count,
995 ref_start: ss.ref_start,
996 ref_end: ss.ref_end,
997 is_negative_strand: ss.is_negative_strand,
998 }
999 }
1000
1001 fn build_duplex_consensus_from_padded(
1007 &mut self,
1008 ss_a: &SingleStrandConsensus,
1009 ss_b: &SingleStrandConsensus,
1010 ) -> Result<SingleStrandConsensus> {
1011 let len = ss_a.bases.len();
1012 assert_eq!(ss_b.bases.len(), len, "Padded consensuses must be the same length");
1013
1014 let mut bases = vec![NO_CALL_BASE; len];
1015 let mut quals: Vec<PhredScore> = vec![MIN_PHRED; len];
1016 let mut depths = vec![0u16; len];
1017 let mut errors = vec![0u16; len];
1018
1019 let mut duplex_disagreements = 0usize;
1020 let mut duplex_bases_count = 0usize;
1021
1022 for pos_idx in 0..len {
1023 let ba = ss_a.bases[pos_idx];
1024 let qa = ss_a.quals[pos_idx];
1025 let da = ss_a.depths[pos_idx];
1026 let ea = ss_a.errors[pos_idx];
1027
1028 let bb = ss_b.bases[pos_idx];
1029 let qb = ss_b.quals[pos_idx];
1030 let db = ss_b.depths[pos_idx];
1031 let eb = ss_b.errors[pos_idx];
1032
1033 let a_has_data = ba != NO_CALL_BASE && ba != NO_CALL_BASE_LOWER;
1037 let b_has_data = bb != NO_CALL_BASE && bb != NO_CALL_BASE_LOWER;
1038
1039 let (duplex_base, duplex_qual, depth, error) = match (a_has_data, b_has_data) {
1040 (true, true) => {
1041 duplex_bases_count += 1;
1043 let (raw_base, raw_qual) = if ba == bb {
1044 (ba, min(93, u16::from(qa) + u16::from(qb)) as u8)
1046 } else if qa > qb {
1047 duplex_disagreements += 1;
1049 (ba, max(MIN_PHRED, qa.saturating_sub(qb)))
1051 } else if qb > qa {
1052 duplex_disagreements += 1;
1053 (bb, max(MIN_PHRED, qb.saturating_sub(qa)))
1055 } else {
1056 duplex_disagreements += 1;
1061 (ba, MIN_PHRED) };
1063
1064 let (final_base, final_qual) = if raw_qual == MIN_PHRED {
1066 (NO_CALL_BASE, MIN_PHRED)
1067 } else {
1068 (raw_base, raw_qual)
1069 };
1070
1071 let duplex_error = if ba == bb {
1076 ea + eb
1078 } else if ba == raw_base {
1079 ea + db.saturating_sub(eb)
1082 } else {
1083 eb + da.saturating_sub(ea)
1085 };
1086
1087 (final_base, final_qual, da + db, duplex_error)
1088 }
1089 (true, false) => {
1090 if qa == MIN_PHRED {
1093 (NO_CALL_BASE, MIN_PHRED, da, ea)
1094 } else {
1095 (ba, qa, da, ea)
1096 }
1097 }
1098 (false, true) => {
1099 if qb == MIN_PHRED {
1102 (NO_CALL_BASE, MIN_PHRED, db, eb)
1103 } else {
1104 (bb, qb, db, eb)
1105 }
1106 }
1107 (false, false) => {
1108 let duplex_error = ea + eb;
1112 (NO_CALL_BASE, MIN_PHRED, 0, duplex_error)
1113 }
1114 };
1115
1116 let (duplex_base, duplex_qual) = if ba == NO_CALL_BASE || bb == NO_CALL_BASE {
1120 (NO_CALL_BASE, MIN_PHRED)
1121 } else {
1122 (duplex_base, duplex_qual)
1123 };
1124
1125 bases[pos_idx] = duplex_base;
1126 quals[pos_idx] = duplex_qual;
1127 depths[pos_idx] = depth;
1128 errors[pos_idx] = error;
1129 }
1130
1131 if duplex_bases_count > 0 {
1133 let duplex_error_rate = duplex_disagreements as f64 / duplex_bases_count as f64;
1134 self.stats.consensus_duplex_bases_emitted += duplex_bases_count as u64;
1135 self.stats.duplex_disagreement_base_count += duplex_disagreements as u64;
1136
1137 if duplex_disagreements > self.options.max_duplex_disagreements {
1138 anyhow::bail!("High duplex disagreement: {duplex_disagreements} disagreements");
1139 }
1140 if duplex_error_rate > self.options.max_duplex_disagreement_rate {
1141 anyhow::bail!("High duplex disagreement rate: {duplex_error_rate:.4}");
1142 }
1143 }
1144
1145 Ok(SingleStrandConsensus {
1146 bases,
1147 quals,
1148 depths,
1149 errors,
1150 raw_read_count: ss_a.raw_read_count + ss_b.raw_read_count,
1151 ref_start: 0,
1152 ref_end: len.saturating_sub(1),
1153 is_negative_strand: ss_a.is_negative_strand,
1154 })
1155 }
1156
1157 fn mask_consensus_quals_query_based(
1161 &self,
1162 mut consensus: SingleStrandConsensus,
1163 padded_r1: &SingleStrandConsensus,
1164 padded_r2: &SingleStrandConsensus,
1165 ) -> SingleStrandConsensus {
1166 let len = consensus.bases.len();
1167
1168 for idx in 0..len {
1169 let a_is_n = padded_r1.bases.get(idx).copied().unwrap_or(NO_CALL_BASE) == NO_CALL_BASE;
1173 let b_is_n = padded_r2.bases.get(idx).copied().unwrap_or(NO_CALL_BASE) == NO_CALL_BASE;
1174
1175 if (a_is_n || b_is_n) && consensus.bases[idx] != NO_CALL_BASE {
1176 if let Some(ss_qual) = self.options.single_strand_qual {
1178 consensus.quals[idx] = ss_qual;
1179 }
1180 }
1181
1182 if let Some(outer_qual) = self.options.outer_bases_qual {
1184 let outer_len = self.options.outer_bases_length;
1185 if idx < outer_len || idx >= len.saturating_sub(outer_len) {
1186 consensus.quals[idx] = min(consensus.quals[idx], outer_qual);
1187 }
1188 }
1189 }
1190
1191 consensus
1192 }
1193
1194 #[expect(
1196 clippy::unnecessary_wraps,
1197 reason = "Result return type kept for API consistency with other callers"
1198 )]
1199 #[expect(
1200 clippy::too_many_arguments,
1201 reason = "all_records needed separately from source_raws for UMI consensus"
1202 )]
1203 fn build_output_record_into(
1204 &mut self,
1205 output: &mut ConsensusOutput,
1206 consensus: &SingleStrandConsensus,
1207 ss_a: &SingleStrandConsensus,
1208 ss_b: &SingleStrandConsensus,
1209 umi: Option<&str>,
1210 source_raws: &[&[u8]],
1211 all_records: &[Vec<u8>],
1212 ) -> Result<()> {
1213 self.consensus_counter += 1;
1215 let read_name = if let Some(umi_str) = umi {
1216 format!("{}:{}", self.read_name_prefix, umi_str)
1217 } else {
1218 format!("{}:{}", self.read_name_prefix, self.consensus_counter)
1219 };
1220
1221 let flag = flags::UNMAPPED;
1223
1224 self.bam_builder.build_record(
1225 read_name.as_bytes(),
1226 flag,
1227 &consensus.bases,
1228 &consensus.quals,
1229 );
1230
1231 self.bam_builder.append_string_tag(b"RG", self.read_group_id.as_bytes());
1233
1234 if let Some(umi_str) = umi {
1236 self.bam_builder.append_string_tag(b"MI", umi_str.as_bytes());
1237 }
1238
1239 let total_depths: Vec<i32> = ss_a
1243 .depths
1244 .iter()
1245 .zip(ss_b.depths.iter())
1246 .map(|(&a, &b)| i32::from(a) + i32::from(b))
1247 .collect();
1248 let total_depth = total_depths.iter().copied().max().unwrap_or(0);
1249 let min_depth = total_depths.iter().copied().min().unwrap_or(0);
1250 let total_errors: usize = consensus.errors.iter().map(|&e| e as usize).sum();
1251 let total_bases: i32 = total_depths.iter().sum();
1252 let error_rate =
1253 if total_bases > 0 { total_errors as f32 / total_bases as f32 } else { 0.0 };
1254
1255 self.bam_builder.append_int_tag(b"cD", total_depth);
1256 self.bam_builder.append_int_tag(b"cM", min_depth);
1257 self.bam_builder.append_float_tag(b"cE", error_rate);
1258
1259 let a_max_depth = ss_a.depths.iter().map(|&d| i32::from(d)).max().unwrap_or(0);
1261 let a_min_depth = ss_a.depths.iter().map(|&d| i32::from(d)).min().unwrap_or(0);
1262 let a_total_errors: usize = ss_a.errors.iter().map(|&e| e as usize).sum();
1263 let a_total_bases: usize = ss_a.depths.iter().map(|&d| d as usize).sum();
1264 let a_error_rate =
1265 if a_total_bases > 0 { a_total_errors as f32 / a_total_bases as f32 } else { 0.0 };
1266
1267 self.bam_builder.append_int_tag(b"aD", a_max_depth);
1268 self.bam_builder.append_int_tag(b"aM", a_min_depth);
1269 self.bam_builder.append_float_tag(b"aE", a_error_rate);
1270
1271 let b_max_depth = ss_b.depths.iter().map(|&d| i32::from(d)).max().unwrap_or(0);
1273 let b_min_depth = ss_b.depths.iter().map(|&d| i32::from(d)).min().unwrap_or(0);
1274 let b_total_errors: usize = ss_b.errors.iter().map(|&e| e as usize).sum();
1275 let b_total_bases: usize = ss_b.depths.iter().map(|&d| d as usize).sum();
1276 let b_error_rate =
1277 if b_total_bases > 0 { b_total_errors as f32 / b_total_bases as f32 } else { 0.0 };
1278
1279 self.bam_builder.append_int_tag(b"bD", b_max_depth);
1280 self.bam_builder.append_int_tag(b"bM", b_min_depth);
1281 self.bam_builder.append_float_tag(b"bE", b_error_rate);
1282
1283 if self.options.produce_per_base_tags {
1285 let ad_array: Vec<i16> = ss_a.depths.iter().map(|&d| d as i16).collect();
1287 let bd_array: Vec<i16> = ss_b.depths.iter().map(|&d| d as i16).collect();
1288 self.bam_builder.append_i16_array_tag(b"ad", &ad_array);
1289 self.bam_builder.append_i16_array_tag(b"bd", &bd_array);
1290
1291 let ae_array: Vec<i16> = ss_a.errors.iter().map(|&e| e as i16).collect();
1293 let be_array: Vec<i16> = ss_b.errors.iter().map(|&e| e as i16).collect();
1294 self.bam_builder.append_i16_array_tag(b"ae", &ae_array);
1295 self.bam_builder.append_i16_array_tag(b"be", &be_array);
1296
1297 self.bam_builder.append_string_tag(b"ac", &ss_a.bases);
1299 self.bam_builder.append_string_tag(b"bc", &ss_b.bases);
1300
1301 self.bam_builder.append_phred33_string_tag(b"aq", &ss_a.quals);
1303 self.bam_builder.append_phred33_string_tag(b"bq", &ss_b.quals);
1304 }
1305
1306 if let Some(cell_tag) = &self.options.cell_tag {
1308 let cell_tag_bytes: [u8; 2] = [cell_tag.as_ref()[0], cell_tag.as_ref()[1]];
1309 for raw in source_raws {
1310 if let Some(cell_bc) = bam_fields::find_string_tag_in_record(raw, &cell_tag_bytes) {
1311 if !cell_bc.is_empty() {
1312 self.bam_builder.append_string_tag(&cell_tag_bytes, cell_bc);
1313 break;
1314 }
1315 }
1316 }
1317 }
1318
1319 let umis: Vec<String> = all_records
1324 .iter()
1325 .filter_map(|raw| {
1326 bam_fields::find_string_tag_in_record(raw, b"RX")
1327 .and_then(|b| String::from_utf8(b.to_vec()).ok())
1328 })
1329 .collect();
1330
1331 if !umis.is_empty() {
1332 let consensus_umi = consensus_umis(&umis);
1333 if !consensus_umi.is_empty() {
1334 self.bam_builder.append_string_tag(b"RX", consensus_umi.as_bytes());
1335 }
1336 }
1337
1338 self.bam_builder.write_with_block_size(&mut output.data);
1340 output.count += 1;
1341
1342 Ok(())
1343 }
1344
1345 fn reject_records_count(&mut self, count: usize, reason: CallerRejectionReason) {
1347 *self.stats.rejection_reasons.entry(reason).or_insert(0) += count;
1348 self.stats.reads_filtered += count as u64;
1349 }
1350}
1351
1352impl ConsensusCaller for CodecConsensusCaller {
1357 fn consensus_reads(&mut self, records: Vec<Vec<u8>>) -> Result<ConsensusOutput> {
1358 let result = self.consensus_reads_raw(&records)?;
1359 if self.track_rejects && result.count == 0 && !records.is_empty() {
1361 self.rejected_reads.extend(records);
1362 }
1363 Ok(result)
1364 }
1365
1366 #[expect(
1367 clippy::cast_possible_truncation,
1368 reason = "read counts will not exceed usize on any supported platform"
1369 )]
1370 fn total_reads(&self) -> usize {
1371 self.stats.total_input_reads as usize
1372 }
1373
1374 #[expect(
1375 clippy::cast_possible_truncation,
1376 reason = "read counts will not exceed usize on any supported platform"
1377 )]
1378 fn total_filtered(&self) -> usize {
1379 self.stats.reads_filtered as usize
1380 }
1381
1382 #[expect(
1383 clippy::cast_possible_truncation,
1384 reason = "read counts will not exceed usize on any supported platform"
1385 )]
1386 fn consensus_reads_constructed(&self) -> usize {
1387 self.stats.consensus_reads_generated as usize
1388 }
1389
1390 #[expect(
1391 clippy::cast_possible_truncation,
1392 reason = "read counts will not exceed usize on any supported platform"
1393 )]
1394 fn statistics(&self) -> ConsensusCallingStats {
1395 ConsensusCallingStats {
1396 total_reads: self.stats.total_input_reads as usize,
1397 consensus_reads: self.stats.consensus_reads_generated as usize,
1398 filtered_reads: self.stats.reads_filtered as usize,
1399 rejection_reasons: self.stats.rejection_reasons.clone(),
1400 }
1401 }
1402
1403 fn log_statistics(&self) {
1404 let stats = &self.stats;
1405 log::info!("CODEC Consensus Calling Statistics:");
1406 log::info!(" Total input reads: {}", stats.total_input_reads);
1407 log::info!(" Consensus reads generated: {}", stats.consensus_reads_generated);
1408 log::info!(" Reads filtered: {}", stats.reads_filtered);
1409 log::info!(" Consensus bases emitted: {}", stats.consensus_bases_emitted);
1410 log::info!(" Duplex bases emitted: {}", stats.consensus_duplex_bases_emitted);
1411 log::info!(" Duplex disagreement rate: {:.4}%", stats.duplex_disagreement_rate() * 100.0);
1412 if !stats.rejection_reasons.is_empty() {
1413 log::info!(" Rejection reasons:");
1414 for (reason, count) in &stats.rejection_reasons {
1415 log::info!(" {reason:?}: {count}");
1416 }
1417 }
1418 }
1419}
1420
1421impl CodecConsensusCaller {
1424 fn record_buf_to_raw(rec: &noodles::sam::alignment::RecordBuf) -> Vec<u8> {
1426 use crate::vendored::bam_codec::encode_record_buf;
1427 use noodles::sam::header::record::value::Map;
1428 use noodles::sam::header::record::value::map::ReferenceSequence;
1429 use std::num::NonZeroUsize;
1430
1431 let mut header = noodles::sam::Header::default();
1432 for name in &["chr1", "chr2", "chr3"] {
1434 let rs = Map::<ReferenceSequence>::new(NonZeroUsize::new(1000).unwrap());
1435 header.reference_sequences_mut().insert(bstr::BString::from(*name), rs);
1436 }
1437
1438 let mut buf = Vec::new();
1439 encode_record_buf(&mut buf, &header, rec).expect("encode_record_buf failed");
1440 buf
1441 }
1442
1443 #[expect(
1449 clippy::needless_pass_by_value,
1450 reason = "matches ConsensusCaller trait signature pattern"
1451 )]
1452 pub fn consensus_reads_from_sam_records(
1453 &mut self,
1454 recs: Vec<noodles::sam::alignment::RecordBuf>,
1455 ) -> Result<ConsensusOutput> {
1456 let raw_records: Vec<Vec<u8>> = recs.iter().map(Self::record_buf_to_raw).collect();
1457 self.consensus_reads(raw_records)
1458 }
1459}
1460
1461#[cfg(test)]
1462#[expect(
1463 clippy::must_use_candidate,
1464 clippy::needless_pass_by_value,
1465 clippy::cast_sign_loss,
1466 reason = "test-only methods use owned values and numeric casts"
1467)]
1468impl CodecConsensusCaller {
1469 pub fn is_fr_pair(&self, rec: &noodles::sam::alignment::RecordBuf) -> bool {
1471 let raw = Self::record_buf_to_raw(rec);
1472 bam_fields::is_fr_pair_raw(&raw)
1473 }
1474
1475 pub fn filter_to_most_common_alignment(
1477 &mut self,
1478 recs: Vec<noodles::sam::alignment::RecordBuf>,
1479 ) -> Vec<noodles::sam::alignment::RecordBuf> {
1480 let raws: Vec<Vec<u8>> = recs.iter().map(Self::record_buf_to_raw).collect();
1481 let infos: Vec<ClippedRecordInfo> =
1482 raws.iter().enumerate().map(|(i, raw)| Self::build_clipped_info(raw, i, 0)).collect();
1483 let filtered = self.filter_to_most_common_alignment_raw(infos);
1484 filtered.into_iter().map(|info| recs[info.raw_idx].clone()).collect()
1485 }
1486
1487 pub fn read_pos_at_ref_pos(
1489 rec: &noodles::sam::alignment::RecordBuf,
1490 ref_pos: usize,
1491 return_last_base_if_deleted: bool,
1492 ) -> Option<usize> {
1493 let raw = Self::record_buf_to_raw(rec);
1494 let cigar_ops = bam_fields::get_cigar_ops(&raw);
1495 let alignment_start = (bam_fields::pos(&raw) + 1) as usize; bam_fields::read_pos_at_ref_pos_raw(
1497 &cigar_ops,
1498 alignment_start,
1499 ref_pos,
1500 return_last_base_if_deleted,
1501 )
1502 }
1503
1504 pub fn check_overlap_phase(
1506 &self,
1507 r1: &noodles::sam::alignment::RecordBuf,
1508 r2: &noodles::sam::alignment::RecordBuf,
1509 overlap_start: usize,
1510 overlap_end: usize,
1511 ) -> bool {
1512 let raw1 = Self::record_buf_to_raw(r1);
1513 let raw2 = Self::record_buf_to_raw(r2);
1514 let info1 = Self::build_clipped_info(&raw1, 0, 0);
1515 let info2 = Self::build_clipped_info(&raw2, 1, 0);
1516 Self::check_overlap_phase_raw(&info1, &info2, overlap_start, overlap_end)
1517 }
1518
1519 pub(crate) fn to_source_read_for_codec(
1521 rec: &noodles::sam::alignment::RecordBuf,
1522 original_idx: usize,
1523 ) -> SourceRead {
1524 let raw = Self::record_buf_to_raw(rec);
1525 Self::to_source_read_for_codec_raw(&raw, original_idx, 0, false, None)
1526 }
1527
1528 pub fn compute_codec_consensus_length(
1530 pos_rec: &noodles::sam::alignment::RecordBuf,
1531 neg_rec: &noodles::sam::alignment::RecordBuf,
1532 overlap_end: usize,
1533 ) -> Option<usize> {
1534 let raw_pos = Self::record_buf_to_raw(pos_rec);
1535 let raw_neg = Self::record_buf_to_raw(neg_rec);
1536 let info_pos = Self::build_clipped_info(&raw_pos, 0, 0);
1537 let info_neg = Self::build_clipped_info(&raw_neg, 1, 0);
1538 Self::compute_consensus_length_raw(&info_pos, &info_neg, overlap_end)
1539 }
1540
1541 pub fn downsample_pairs(
1543 &mut self,
1544 r1s: Vec<noodles::sam::alignment::RecordBuf>,
1545 r2s: Vec<noodles::sam::alignment::RecordBuf>,
1546 ) -> (Vec<noodles::sam::alignment::RecordBuf>, Vec<noodles::sam::alignment::RecordBuf>) {
1547 let Some(max_reads) = self.options.max_reads_per_strand else {
1548 return (r1s, r2s);
1549 };
1550 if r1s.len() <= max_reads {
1551 return (r1s, r2s);
1552 }
1553 let mut indices: Vec<usize> = (0..r1s.len()).collect();
1554 indices.shuffle(&mut self.rng);
1555 indices.truncate(max_reads);
1556 indices.sort_unstable();
1557 let new_r1: Vec<_> = indices.iter().map(|&i| r1s[i].clone()).collect();
1558 let new_r2: Vec<_> = indices.iter().map(|&i| r2s[i].clone()).collect();
1559 (new_r1, new_r2)
1560 }
1561}
1562
1563#[cfg(test)]
1564#[expect(
1565 clippy::similar_names,
1566 reason = "test variables use domain-specific names like r1/r2, ad/bd"
1567)]
1568mod tests {
1569 use super::*;
1570 use fgumi_raw_bam::ParsedBamRecord;
1571 use fgumi_sam::builder::RecordBuilder;
1572 use fgumi_sam::clipper::cigar_utils;
1573 use noodles::sam::alignment::RecordBuf;
1574 use noodles::sam::alignment::record::Cigar as CigarTrait;
1575 use noodles::sam::alignment::record::Flags;
1576 use noodles::sam::alignment::record::cigar::op::Kind;
1577
1578 #[test]
1579 fn test_codec_caller_creation() {
1580 let options = CodecConsensusOptions::default();
1581 let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1582
1583 assert_eq!(caller.read_name_prefix, "codec");
1584 assert_eq!(caller.read_group_id, "RG1");
1585 }
1586
1587 #[test]
1588 fn test_codec_options_defaults() {
1589 let options = CodecConsensusOptions::default();
1590 assert_eq!(options.min_reads_per_strand, 1);
1591 assert_eq!(options.min_duplex_length, 1);
1592 assert_eq!(options.min_input_base_quality, 10);
1593 assert_eq!(options.error_rate_pre_umi, 45);
1594 assert_eq!(options.error_rate_post_umi, 40);
1595 }
1596
1597 #[test]
1598 fn test_empty_input() {
1599 let options = CodecConsensusOptions::default();
1600 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1601
1602 let output = caller.consensus_reads_from_sam_records(Vec::new()).unwrap();
1603 assert_eq!(output.count, 0);
1604 assert_eq!(caller.stats.total_input_reads, 0);
1605 }
1606
1607 #[test]
1608 fn test_reverse_complement() {
1609 assert_eq!(reverse_complement(b"ACGT"), b"ACGT".to_vec());
1610 assert_eq!(reverse_complement(b"AAAA"), b"TTTT".to_vec());
1611 assert_eq!(reverse_complement(b"ACGTN"), b"NACGT".to_vec());
1612 assert_eq!(reverse_complement(b""), b"".to_vec());
1613 }
1614
1615 #[test]
1616 fn test_stats_duplex_disagreement_rate() {
1617 let mut stats = CodecConsensusStats::default();
1618 assert!(stats.duplex_disagreement_rate().abs() < f64::EPSILON);
1619
1620 stats.consensus_duplex_bases_emitted = 100;
1621 stats.duplex_disagreement_base_count = 5;
1622 assert!((stats.duplex_disagreement_rate() - 0.05).abs() < 1e-6);
1623 }
1624
1625 #[expect(
1627 clippy::too_many_arguments,
1628 clippy::cast_possible_truncation,
1629 clippy::cast_possible_wrap,
1630 clippy::cast_sign_loss,
1631 reason = "test helper needs all parameters and uses casts for BAM position math"
1632 )]
1633 fn create_test_paired_read(
1634 name: &str,
1635 seq: &[u8],
1636 qual: &[u8],
1637 is_first: bool,
1638 is_reverse: bool,
1639 mate_is_reverse: bool,
1640 ref_start: usize,
1641 cigar_ops: &[(Kind, usize)],
1642 ) -> RecordBuf {
1643 use std::fmt::Write;
1645 let mut ref_len = 0usize;
1646 let cigar_str: String = cigar_ops.iter().fold(String::new(), |mut acc, (kind, len)| {
1647 match kind {
1649 Kind::Match
1650 | Kind::SequenceMatch
1651 | Kind::SequenceMismatch
1652 | Kind::Deletion
1653 | Kind::Skip => {
1654 ref_len += len;
1655 }
1656 _ => {}
1657 }
1658 let c = match kind {
1659 Kind::Match => 'M',
1660 Kind::Insertion => 'I',
1661 Kind::Deletion => 'D',
1662 Kind::SoftClip => 'S',
1663 Kind::HardClip => 'H',
1664 Kind::Skip => 'N',
1665 Kind::Pad => 'P',
1666 Kind::SequenceMatch => '=',
1667 Kind::SequenceMismatch => 'X',
1668 };
1669 let _ = write!(acc, "{len}{c}");
1670 acc
1671 });
1672
1673 let insert_size: i32 = 200;
1676 let (mate_start, tlen) = if is_reverse {
1677 let mate_pos = (ref_start as i32 - insert_size + ref_len as i32).max(1) as usize;
1679 (mate_pos, -insert_size)
1680 } else {
1681 let mate_pos = ((ref_start as i32) + insert_size - (ref_len as i32)).max(1) as usize;
1683 (mate_pos, insert_size)
1684 };
1685
1686 let seq_str = String::from_utf8_lossy(seq);
1687 RecordBuilder::new()
1688 .name(name)
1689 .sequence(&seq_str)
1690 .qualities(qual)
1691 .reference_sequence_id(0)
1692 .alignment_start(ref_start)
1693 .cigar(&cigar_str)
1694 .first_segment(is_first)
1695 .properly_paired(true)
1696 .reverse_complement(is_reverse)
1697 .mate_reverse_complement(mate_is_reverse)
1698 .mate_reference_sequence_id(0)
1699 .mate_alignment_start(mate_start)
1700 .template_length(tlen)
1701 .tag("MI", "UMI123")
1702 .build()
1703 }
1704
1705 #[test]
1706 fn test_is_fr_pair() {
1707 use noodles::sam::alignment::record::cigar::op::Kind;
1708
1709 let options = CodecConsensusOptions::default();
1710 let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1711
1712 let r1_fr = create_test_paired_read(
1714 "read1",
1715 b"ACGT",
1716 b"####",
1717 true,
1718 false,
1719 true,
1720 100,
1721 &[(Kind::Match, 4)],
1722 );
1723 assert!(caller.is_fr_pair(&r1_fr));
1724
1725 let r1_rf = create_test_paired_read(
1727 "read1",
1728 b"ACGT",
1729 b"####",
1730 true,
1731 true,
1732 false,
1733 100,
1734 &[(Kind::Match, 4)],
1735 );
1736 assert!(caller.is_fr_pair(&r1_rf));
1737
1738 let r1_ff = create_test_paired_read(
1740 "read1",
1741 b"ACGT",
1742 b"####",
1743 true,
1744 false,
1745 false,
1746 100,
1747 &[(Kind::Match, 4)],
1748 );
1749 assert!(!caller.is_fr_pair(&r1_ff));
1750 }
1751
1752 #[test]
1753 fn test_simplify_cigar() {
1754 use noodles::sam::alignment::record::cigar::op::Kind;
1755
1756 let read = create_test_paired_read(
1758 "read",
1759 b"ACGTACGT",
1760 b"########",
1761 true,
1762 false,
1763 true,
1764 100,
1765 &[(Kind::Match, 8)],
1766 );
1767 assert_eq!(cigar_utils::simplify_cigar(read.cigar()), vec![(Kind::Match, 8)]);
1768
1769 let read_indel = create_test_paired_read(
1771 "read",
1772 b"ACGTACGT",
1773 b"########",
1774 true,
1775 false,
1776 true,
1777 100,
1778 &[(Kind::Match, 4), (Kind::Deletion, 2), (Kind::Match, 4)],
1779 );
1780 assert_eq!(
1781 cigar_utils::simplify_cigar(read_indel.cigar()),
1782 vec![(Kind::Match, 4), (Kind::Deletion, 2), (Kind::Match, 4)]
1783 );
1784
1785 let read_softclip = create_test_paired_read(
1787 "read",
1788 b"ACGTACGT",
1789 b"########",
1790 true,
1791 false,
1792 true,
1793 100,
1794 &[(Kind::SoftClip, 2), (Kind::Match, 6)],
1795 );
1796 assert_eq!(
1797 cigar_utils::simplify_cigar(read_softclip.cigar()),
1798 vec![(Kind::Match, 8)] );
1800 }
1801
1802 #[test]
1803 fn test_filter_to_most_common_alignment() {
1804 use noodles::sam::alignment::record::cigar::op::Kind;
1805
1806 let options = CodecConsensusOptions::default();
1807 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1808
1809 let reads = vec![
1811 create_test_paired_read(
1812 "r1",
1813 b"ACGT",
1814 b"####",
1815 true,
1816 false,
1817 true,
1818 100,
1819 &[(Kind::Match, 4)],
1820 ),
1821 create_test_paired_read(
1822 "r2",
1823 b"ACGT",
1824 b"####",
1825 true,
1826 false,
1827 true,
1828 100,
1829 &[(Kind::Match, 4)],
1830 ),
1831 create_test_paired_read(
1832 "r3",
1833 b"ACGT",
1834 b"####",
1835 true,
1836 false,
1837 true,
1838 100,
1839 &[(Kind::Match, 4)],
1840 ),
1841 create_test_paired_read(
1842 "r4",
1843 b"ACGT",
1844 b"####",
1845 true,
1846 false,
1847 true,
1848 100,
1849 &[(Kind::Match, 2), (Kind::Deletion, 1), (Kind::Match, 2)],
1850 ),
1851 ];
1852
1853 let filtered = caller.filter_to_most_common_alignment(reads);
1854 assert_eq!(filtered.len(), 3);
1855 assert_eq!(
1856 caller.stats.rejection_reasons.get(&CallerRejectionReason::MinorityAlignment),
1857 Some(&1)
1858 );
1859 }
1860
1861 #[test]
1866 fn test_filter_to_most_common_alignment_tie_breaking() {
1867 use noodles::sam::alignment::record::cigar::op::Kind;
1868
1869 let options = CodecConsensusOptions::default();
1870 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1871
1872 let reads = vec![
1879 create_test_paired_read(
1880 "r1_4M",
1881 b"ACGT",
1882 b"####",
1883 true,
1884 false,
1885 true,
1886 100,
1887 &[(Kind::Match, 4)],
1888 ),
1889 create_test_paired_read(
1890 "r2_4M",
1891 b"ACGT",
1892 b"####",
1893 true,
1894 false,
1895 true,
1896 100,
1897 &[(Kind::Match, 4)],
1898 ),
1899 create_test_paired_read(
1900 "r3_del",
1901 b"ACGT",
1902 b"####",
1903 true,
1904 false,
1905 true,
1906 100,
1907 &[(Kind::Match, 3), (Kind::Deletion, 1), (Kind::Match, 1)],
1908 ),
1909 create_test_paired_read(
1910 "r4_del",
1911 b"ACGT",
1912 b"####",
1913 true,
1914 false,
1915 true,
1916 100,
1917 &[(Kind::Match, 3), (Kind::Deletion, 1), (Kind::Match, 1)],
1918 ),
1919 ];
1920
1921 let filtered = caller.filter_to_most_common_alignment(reads);
1922 assert_eq!(filtered.len(), 2);
1924 for read in &filtered {
1928 let cigar = read.cigar();
1929 assert_eq!(cigar.as_ref().len(), 3, "Expected 3-op cigar (3M1D1M)");
1930 let ops: Vec<_> = cigar.iter().map(|r| r.unwrap()).collect();
1931 assert_eq!(ops[0].kind(), Kind::Match);
1932 assert_eq!(ops[0].len(), 3);
1933 assert_eq!(ops[1].kind(), Kind::Deletion);
1934 assert_eq!(ops[1].len(), 1);
1935 assert_eq!(ops[2].kind(), Kind::Match);
1936 assert_eq!(ops[2].len(), 1);
1937 }
1938 assert_eq!(
1940 caller.stats.rejection_reasons.get(&CallerRejectionReason::MinorityAlignment),
1941 Some(&2)
1942 );
1943 }
1944
1945 #[test]
1959 fn test_filter_to_most_common_alignment_negative_strand_cigar_reversal() {
1960 use noodles::sam::alignment::record::cigar::op::Kind;
1961
1962 let options = CodecConsensusOptions::default();
1963 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1964
1965 let reads = vec![
1968 create_test_paired_read(
1970 "r1_groupA",
1971 b"ACGTAC",
1972 b"######",
1973 true,
1974 true, false,
1976 100,
1977 &[(Kind::Match, 3), (Kind::Insertion, 1), (Kind::Match, 2)],
1978 ),
1979 create_test_paired_read(
1980 "r2_groupA",
1981 b"ACGTAC",
1982 b"######",
1983 true,
1984 true, false,
1986 100,
1987 &[(Kind::Match, 3), (Kind::Insertion, 1), (Kind::Match, 2)],
1988 ),
1989 create_test_paired_read(
1991 "r3_groupB",
1992 b"ACGTAC",
1993 b"######",
1994 true,
1995 true, false,
1997 100,
1998 &[(Kind::Match, 2), (Kind::Insertion, 1), (Kind::Match, 3)],
1999 ),
2000 create_test_paired_read(
2001 "r4_groupB",
2002 b"ACGTAC",
2003 b"######",
2004 true,
2005 true, false,
2007 100,
2008 &[(Kind::Match, 2), (Kind::Insertion, 1), (Kind::Match, 3)],
2009 ),
2010 ];
2011
2012 let filtered = caller.filter_to_most_common_alignment(reads);
2013
2014 assert_eq!(filtered.len(), 2);
2016
2017 for read in &filtered {
2023 let cigar = read.cigar();
2024 let ops: Vec<_> = cigar.iter().map(|r| r.unwrap()).collect();
2025 assert_eq!(ops.len(), 3, "Expected 3-op cigar");
2026 assert_eq!(ops[0].kind(), Kind::Match);
2028 assert_eq!(ops[0].len(), 3, "Expected first element to be 3M (Group A won)");
2029 assert_eq!(ops[1].kind(), Kind::Insertion);
2030 assert_eq!(ops[1].len(), 1);
2031 assert_eq!(ops[2].kind(), Kind::Match);
2032 assert_eq!(ops[2].len(), 2);
2033 }
2034
2035 assert_eq!(
2037 caller.stats.rejection_reasons.get(&CallerRejectionReason::MinorityAlignment),
2038 Some(&2)
2039 );
2040 }
2041
2042 const REF_BASES: &[u8] = b"TGGGTGTTGTTTGGGTTCCTGCTTAAAGAGCTACTGTTCTTCACAGAAACTTCCAACTCACCCAGACTGAGATTTGTACTGAGACTACGATCCACATGTTCAATATCTGATATCTGATGGGAAATAGGCTTTACTGAATTATCCATTTGGGCTGTAATTAATTTCAGTGATGAGCGGGAGATGTTGTTAGTTGTGCTCAGTAACTTTTTGATAGTAGCGGGAGTAGGAGTAAATCTTGTACTAATTAGTGAATATTCTGTTGATGGTGGCTGAAAATTTATAGCTACACAACCAAAAAAATAAAAAACGTTAGTCAATAGCATTTATAAATAGTCTTCTCTACCTGAAATATTTTACATTAAGTAATTCATTCCTTCATTTAGTATCTACACATGTCTAACATTGTAGTAGGAGCTGTGTACTAACAAGAAATCATGACACTGTTTCTGCCTTCAAGGAGCTTATAATCTTTTGGGGTACACAAGATAACCCAGAATGTTAAATAGTATAAAAGTCAAAGTACAATAATTTATTTCATTAAGATTTTGAAATGGCTAACAAACACCTGTTGATCACCTCATACACATGAGCCTCAAAACAAAGGAAAGCACAGCCCCTATGCCTGAGCAATTTAGAATATTGTCAAGGATAGAGACATGTGAGCCATTCACTATGAAACAATCATTGAGAACTACTACAAGAGTGATAAATATAAAATGAAACCTACAGAAACACAGAAGAGTAAGTAATTTTCCCTATAAAGAAGACAGGAACTAAATGTATAAGCAAAAATTGGGAAATTATATAAATGCTATTTTATATGAGAGGCAAAGAACCACAGGTCTAATAATTTTACAAATGTGATAAAATCAGATTTTATGTCCCCATCTTTCTTGACTGCTCAGCTAGAAATTAAAACATTTTTACACATCTTTTTGGCGGGGGCGGGGGGGATCATTATTTATTTCACCTGCCAAAATACTTCATTTCCTTATTGCACTTTTTTACTTCTTTGGTATGGAAAAATCTAACGGGTTTTAGAGTATGAACACATTTTAAGCAGTGATTAGATACGTTTTTCTTGTTATGCTTTCTATTGCAAATTTAGGATTTGATTTTGCACTGTCTTCATGCAAAGCTCTTCTCAAAGGTCTTAAAATATAAAAAACACTTAATGCTTCTCAAAGCATTAAGATTTTATGTAAATCAAACCAAAACCAGAAAAAGACAGAAGAAAATGAACCAAAAACAACAAAAATAATCCTTAACATAGTTGGCAACAAGTGCAATGAAAGATTTTT";
2048
2049 #[expect(
2056 clippy::too_many_arguments,
2057 clippy::too_many_lines,
2058 clippy::cast_possible_truncation,
2059 clippy::cast_possible_wrap,
2060 reason = "test helper needs all parameters, many lines, and uses casts for BAM position math"
2061 )]
2062 fn create_fr_pair(
2063 name: &str,
2064 start1: usize,
2065 start2: usize,
2066 _read_length: usize,
2067 base_quality: u8,
2068 cigar1: &[(Kind, usize)],
2069 cigar2: &[(Kind, usize)],
2070 mi_tag: &str,
2071 rx_tag: Option<&str>,
2072 strand1_reverse: bool,
2073 strand2_reverse: bool,
2074 ) -> Vec<RecordBuf> {
2075 let calc_ref_length = |cigar: &[(Kind, usize)]| -> usize {
2077 cigar
2078 .iter()
2079 .filter(|(k, _)| matches!(k, Kind::Match | Kind::Deletion | Kind::Skip))
2080 .map(|(_, len)| *len)
2081 .sum()
2082 };
2083
2084 let ref_len1 = calc_ref_length(cigar1);
2085 let ref_len2 = calc_ref_length(cigar2);
2086
2087 let get_sequence = |start: usize, cigar: &[(Kind, usize)]| -> Vec<u8> {
2089 let mut seq = Vec::new();
2090 let mut ref_pos = start - 1; for &(kind, len) in cigar {
2092 match kind {
2093 Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
2094 let end = (ref_pos + len).min(REF_BASES.len());
2096 if ref_pos < REF_BASES.len() {
2097 seq.extend_from_slice(&REF_BASES[ref_pos..end]);
2098 let pad_len = (ref_pos + len).saturating_sub(end);
2100 seq.resize(seq.len() + pad_len, b'A');
2101 }
2102 ref_pos += len;
2103 }
2104 Kind::Insertion | Kind::SoftClip => {
2105 seq.resize(seq.len() + len, b'A');
2107 }
2108 Kind::Deletion | Kind::Skip => {
2109 ref_pos += len;
2111 }
2112 Kind::HardClip | Kind::Pad => {
2113 }
2115 }
2116 }
2117 seq
2118 };
2119
2120 let seq1_fwd = get_sequence(start1, cigar1);
2121 let seq2_fwd = get_sequence(start2, cigar2);
2122
2123 let seq1: Vec<u8> = if strand1_reverse { reverse_complement(&seq1_fwd) } else { seq1_fwd };
2129 let seq2: Vec<u8> = if strand2_reverse { reverse_complement(&seq2_fwd) } else { seq2_fwd };
2130
2131 let qual1 = vec![base_quality; seq1.len()];
2132 let qual2 = vec![base_quality; seq2.len()];
2133
2134 let cigar_to_str = |cigar: &[(Kind, usize)]| -> String {
2136 use std::fmt::Write;
2137 cigar.iter().fold(String::new(), |mut acc, (kind, len)| {
2138 let c = match kind {
2139 Kind::Match => 'M',
2140 Kind::Insertion => 'I',
2141 Kind::Deletion => 'D',
2142 Kind::SoftClip => 'S',
2143 Kind::HardClip => 'H',
2144 Kind::Skip => 'N',
2145 Kind::Pad => 'P',
2146 Kind::SequenceMatch => '=',
2147 Kind::SequenceMismatch => 'X',
2148 };
2149 let _ = write!(acc, "{len}{c}");
2150 acc
2151 })
2152 };
2153
2154 let cigar1_str = cigar_to_str(cigar1);
2155 let cigar2_str = cigar_to_str(cigar2);
2156
2157 let template_len = if start1 <= start2 {
2160 (start2 + ref_len2 - start1) as i32
2161 } else {
2162 -((start1 + ref_len1 - start2) as i32)
2163 };
2164
2165 let seq1_str = String::from_utf8_lossy(&seq1);
2167 let mut r1_builder = RecordBuilder::new()
2168 .name(name)
2169 .sequence(&seq1_str)
2170 .qualities(&qual1)
2171 .reference_sequence_id(0)
2172 .alignment_start(start1)
2173 .cigar(&cigar1_str)
2174 .first_segment(true)
2175 .properly_paired(true)
2176 .reverse_complement(strand1_reverse)
2177 .mate_reverse_complement(strand2_reverse)
2178 .mate_reference_sequence_id(0)
2179 .mate_alignment_start(start2)
2180 .template_length(template_len)
2181 .tag("MI", mi_tag);
2182
2183 if let Some(rx) = rx_tag {
2184 r1_builder = r1_builder.tag("RX", rx);
2185 }
2186 let r1 = r1_builder.build();
2187
2188 let seq2_str = String::from_utf8_lossy(&seq2);
2190 let mut r2_builder = RecordBuilder::new()
2191 .name(name)
2192 .sequence(&seq2_str)
2193 .qualities(&qual2)
2194 .reference_sequence_id(0)
2195 .alignment_start(start2)
2196 .cigar(&cigar2_str)
2197 .first_segment(false)
2198 .properly_paired(true)
2199 .reverse_complement(strand2_reverse)
2200 .mate_reverse_complement(strand1_reverse)
2201 .mate_reference_sequence_id(0)
2202 .mate_alignment_start(start1)
2203 .template_length(-template_len)
2204 .tag("MI", mi_tag);
2205
2206 if let Some(rx) = rx_tag {
2207 r2_builder = r2_builder.tag("RX", rx);
2208 }
2209 let r2 = r2_builder.build();
2210
2211 vec![r1, r2]
2212 }
2213
2214 #[test]
2220 fn test_make_consensus_from_simple_reads() {
2221 let options = CodecConsensusOptions {
2222 min_reads_per_strand: 1,
2223 min_duplex_length: 1,
2224 ..Default::default()
2225 };
2226 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2227
2228 let reads = create_fr_pair(
2230 "read1",
2231 1,
2232 11,
2233 30,
2234 35,
2235 &[(Kind::Match, 30)],
2236 &[(Kind::Match, 30)],
2237 "hi",
2238 Some("ACC-TGA"),
2239 false, true, );
2242
2243 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2244
2245 assert_eq!(output.count, 1, "Should produce one consensus read");
2246
2247 let records = ParsedBamRecord::parse_all(&output.data);
2248 let consensus = &records[0];
2249 let name = String::from_utf8_lossy(&consensus.name);
2251 assert!(name.contains("hi"), "Consensus name should contain MI tag: {name}");
2252
2253 assert_eq!(consensus.bases.len(), 40, "Consensus should be 40bp");
2255
2256 let rx = consensus.get_string_tag(b"RX").expect("RX tag not found in consensus");
2258 assert_eq!(&rx[..], b"ACC-TGA", "RX tag should be preserved");
2259 }
2260
2261 #[test]
2263 fn test_make_consensus_r1_deletion() {
2264 let options = CodecConsensusOptions {
2265 min_reads_per_strand: 1,
2266 min_duplex_length: 1,
2267 ..Default::default()
2268 };
2269 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2270
2271 let reads = create_fr_pair(
2273 "read1",
2274 1,
2275 13,
2276 30,
2277 35,
2278 &[(Kind::Match, 5), (Kind::Deletion, 2), (Kind::Match, 25)],
2279 &[(Kind::Match, 30)],
2280 "hi",
2281 Some("ACC-TGA"),
2282 false,
2283 true,
2284 );
2285
2286 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2287
2288 assert_eq!(output.count, 1, "Should produce one consensus read");
2290
2291 let records = ParsedBamRecord::parse_all(&output.data);
2293 let consensus = &records[0];
2294 assert!(!consensus.bases.is_empty(), "Should have sequence");
2295 }
2296
2297 #[test]
2299 fn test_not_emit_consensus_for_rf_pair() {
2300 let options = CodecConsensusOptions {
2301 min_reads_per_strand: 1,
2302 min_duplex_length: 1,
2303 ..Default::default()
2304 };
2305 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2306
2307 let reads = create_fr_pair(
2309 "read1",
2310 100,
2311 135,
2312 30,
2313 35,
2314 &[(Kind::Match, 30)],
2315 &[(Kind::Match, 30)],
2316 "hi",
2317 Some("ACC-TGA"),
2318 true, false, );
2321
2322 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2323
2324 assert_eq!(output.count, 0, "Should not emit consensus for RF pair");
2325 }
2326
2327 #[test]
2329 fn test_not_emit_consensus_insufficient_reads() {
2330 let options2 = CodecConsensusOptions {
2332 min_reads_per_strand: 2,
2333 min_duplex_length: 1,
2334 ..Default::default()
2335 };
2336 let mut caller2 =
2337 CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options2);
2338
2339 let reads = create_fr_pair(
2340 "read1",
2341 1,
2342 11,
2343 30,
2344 35,
2345 &[(Kind::Match, 30)],
2346 &[(Kind::Match, 30)],
2347 "hi",
2348 Some("ACC-TGA"),
2349 false,
2350 true,
2351 );
2352
2353 let output = caller2.consensus_reads_from_sam_records(reads).unwrap();
2354
2355 assert_eq!(output.count, 0, "Should not emit consensus with insufficient reads");
2356
2357 let options1 = CodecConsensusOptions {
2359 min_reads_per_strand: 1,
2360 min_duplex_length: 1,
2361 ..Default::default()
2362 };
2363 let mut caller1 =
2364 CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options1);
2365
2366 let reads = create_fr_pair(
2367 "read1",
2368 1,
2369 11,
2370 30,
2371 35,
2372 &[(Kind::Match, 30)],
2373 &[(Kind::Match, 30)],
2374 "hi",
2375 Some("ACC-TGA"),
2376 false,
2377 true,
2378 );
2379
2380 let output = caller1.consensus_reads_from_sam_records(reads).unwrap();
2381
2382 assert_eq!(output.count, 1, "Should emit consensus with minReadsPerStrand=1");
2383 }
2384
2385 #[test]
2387 fn test_not_emit_consensus_insufficient_overlap() {
2388 let options20 = CodecConsensusOptions {
2391 min_reads_per_strand: 1,
2392 min_duplex_length: 20,
2393 ..Default::default()
2394 };
2395 let mut caller20 =
2396 CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options20);
2397
2398 let reads = create_fr_pair(
2399 "read1",
2400 1,
2401 11,
2402 30,
2403 35,
2404 &[(Kind::Match, 30)],
2405 &[(Kind::Match, 30)],
2406 "hi",
2407 Some("ACC-TGA"),
2408 false,
2409 true,
2410 );
2411
2412 let output = caller20.consensus_reads_from_sam_records(reads).unwrap();
2413 assert_eq!(
2414 output.count, 1,
2415 "Should emit consensus with minDuplexLength=20 and 20bp overlap"
2416 );
2417
2418 let options21 = CodecConsensusOptions {
2420 min_reads_per_strand: 1,
2421 min_duplex_length: 21,
2422 ..Default::default()
2423 };
2424 let mut caller21 =
2425 CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options21);
2426
2427 let reads = create_fr_pair(
2428 "read1",
2429 1,
2430 11,
2431 30,
2432 35,
2433 &[(Kind::Match, 30)],
2434 &[(Kind::Match, 30)],
2435 "hi",
2436 Some("ACC-TGA"),
2437 false,
2438 true,
2439 );
2440
2441 let output = caller21.consensus_reads_from_sam_records(reads).unwrap();
2442 assert_eq!(
2443 output.count, 0,
2444 "Should not emit consensus with minDuplexLength=21 and 20bp overlap"
2445 );
2446 }
2447
2448 #[test]
2450 fn test_not_emit_consensus_unmapped_mate() {
2451 let options = CodecConsensusOptions {
2452 min_reads_per_strand: 1,
2453 min_duplex_length: 1,
2454 ..Default::default()
2455 };
2456 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2457
2458 let mut reads = create_fr_pair(
2460 "read1",
2461 1,
2462 11,
2463 30,
2464 35,
2465 &[(Kind::Match, 30)],
2466 &[(Kind::Match, 30)],
2467 "hi",
2468 Some("ACC-TGA"),
2469 false,
2470 true,
2471 );
2472
2473 let r2 = &mut reads[1];
2475 *r2.flags_mut() = r2.flags() | Flags::UNMAPPED;
2476
2477 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2478
2479 assert_eq!(output.count, 0, "Should not emit consensus when mate is unmapped");
2480 }
2481
2482 #[test]
2489 fn test_emit_consensus_in_r1_orientation() {
2490 let options = CodecConsensusOptions {
2492 min_reads_per_strand: 1,
2493 min_duplex_length: 1,
2494 ..Default::default()
2495 };
2496 let mut caller_fwd =
2497 CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options.clone());
2498
2499 let reads_fwd = create_fr_pair(
2501 "read1",
2502 1,
2503 11,
2504 30,
2505 35,
2506 &[(Kind::Match, 30)],
2507 &[(Kind::Match, 30)],
2508 "hi",
2509 Some("ACC-TGA"),
2510 false, true, );
2513
2514 let output_fwd = caller_fwd.consensus_reads_from_sam_records(reads_fwd).unwrap();
2515 assert_eq!(output_fwd.count, 1, "Should produce one consensus read");
2516
2517 let records_fwd = ParsedBamRecord::parse_all(&output_fwd.data);
2518 assert_eq!(records_fwd[0].bases.len(), 40, "Forward R1 should produce 40bp consensus");
2520
2521 assert_eq!(
2523 records_fwd[0].flag & flags::REVERSE,
2524 0,
2525 "Forward R1 should produce forward-oriented consensus"
2526 );
2527 }
2528
2529 #[test]
2535 fn test_not_emit_consensus_high_disagreement() {
2536 let options_permissive = CodecConsensusOptions {
2538 min_reads_per_strand: 1,
2539 min_duplex_length: 1,
2540 max_duplex_disagreements: 100,
2541 max_duplex_disagreement_rate: 1.0,
2542 ..Default::default()
2543 };
2544 let mut caller_permissive =
2545 CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options_permissive);
2546
2547 let reads = create_fr_pair(
2549 "read1",
2550 1,
2551 11,
2552 30,
2553 35,
2554 &[(Kind::Match, 30)],
2555 &[(Kind::Match, 30)],
2556 "hi",
2557 Some("ACC-TGA"),
2558 false,
2559 true,
2560 );
2561
2562 let output = caller_permissive.consensus_reads_from_sam_records(reads).unwrap();
2563 assert_eq!(output.count, 1, "Should emit consensus with permissive settings");
2564
2565 let options_strict = CodecConsensusOptions {
2568 min_reads_per_strand: 1,
2569 min_duplex_length: 1,
2570 max_duplex_disagreements: 5, max_duplex_disagreement_rate: 0.05, ..Default::default()
2573 };
2574 let mut caller_strict =
2575 CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options_strict);
2576
2577 let reads2 = create_fr_pair(
2579 "read1",
2580 1,
2581 11,
2582 30,
2583 35,
2584 &[(Kind::Match, 30)],
2585 &[(Kind::Match, 30)],
2586 "hi",
2587 Some("ACC-TGA"),
2588 false,
2589 true,
2590 );
2591
2592 let cons2 = caller_strict.consensus_reads_from_sam_records(reads2);
2595 assert!(cons2.is_err(), "Should reject consensus with strict disagreement settings");
2596 }
2597
2598 #[test]
2602 fn test_make_consensus_r2_deletion() {
2603 let options = CodecConsensusOptions {
2604 min_reads_per_strand: 1,
2605 min_duplex_length: 1,
2606 ..Default::default()
2607 };
2608 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2609
2610 let reads = create_fr_pair(
2614 "read1",
2615 1,
2616 11,
2617 30,
2618 35,
2619 &[(Kind::Match, 30)],
2620 &[(Kind::Match, 25), (Kind::Deletion, 5), (Kind::Match, 5)],
2621 "hi",
2622 Some("ACC-TGA"),
2623 false,
2624 true,
2625 );
2626
2627 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2628
2629 assert_eq!(output.count, 1, "Should produce one consensus read");
2631
2632 let records = ParsedBamRecord::parse_all(&output.data);
2635 let consensus = &records[0];
2636 let len = consensus.bases.len();
2637 assert_eq!(len, 40, "Consensus should be 40bp (skipping the 5bp deletion), got {len}");
2638 }
2639
2640 #[test]
2642 fn test_make_consensus_soft_clipping() {
2643 let options = CodecConsensusOptions {
2644 min_reads_per_strand: 1,
2645 min_duplex_length: 1,
2646 ..Default::default()
2647 };
2648 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2649
2650 let reads = create_fr_pair(
2658 "read1",
2659 1,
2660 11,
2661 30,
2662 35,
2663 &[(Kind::SoftClip, 5), (Kind::Match, 25)],
2664 &[(Kind::Match, 25), (Kind::SoftClip, 5)],
2665 "hi",
2666 Some("ACC-TGA"),
2667 false,
2668 true,
2669 );
2670
2671 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2672
2673 assert_eq!(output.count, 1, "Should produce one consensus read");
2675
2676 let records = ParsedBamRecord::parse_all(&output.data);
2678 let consensus = &records[0];
2679 let len = consensus.bases.len();
2680 assert_eq!(
2683 len, 45,
2684 "Consensus should be 45bp (5 soft-clip + 35 ref + 5 soft-clip), got {len}"
2685 );
2686 }
2687
2688 #[test]
2690 fn test_make_consensus_both_soft_clipped_same_end() {
2691 let options = CodecConsensusOptions {
2692 min_reads_per_strand: 1,
2693 min_duplex_length: 1,
2694 ..Default::default()
2695 };
2696 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2697
2698 let reads = create_fr_pair(
2702 "read1",
2703 1,
2704 1,
2705 30,
2706 35,
2707 &[(Kind::SoftClip, 5), (Kind::Match, 25)],
2708 &[(Kind::SoftClip, 5), (Kind::Match, 25)],
2709 "hi",
2710 Some("ACC-TGA"),
2711 false,
2712 true,
2713 );
2714
2715 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2716
2717 assert_eq!(output.count, 1, "Should produce one consensus read");
2719
2720 let records = ParsedBamRecord::parse_all(&output.data);
2722 let consensus = &records[0];
2723 assert!(!consensus.bases.is_empty(), "Should have sequence with both reads soft-clipped");
2724 }
2725
2726 #[test]
2728 fn test_not_emit_consensus_chimeric_pair() {
2729 let options = CodecConsensusOptions {
2730 min_reads_per_strand: 1,
2731 min_duplex_length: 1,
2732 ..Default::default()
2733 };
2734 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2735
2736 let mut reads = create_fr_pair(
2738 "read1",
2739 100,
2740 135,
2741 30,
2742 35,
2743 &[(Kind::Match, 30)],
2744 &[(Kind::Match, 30)],
2745 "hi",
2746 Some("ACC-TGA"),
2747 false,
2748 true,
2749 );
2750
2751 *reads[0].reference_sequence_id_mut() = Some(2); *reads[1].mate_reference_sequence_id_mut() = Some(2);
2756
2757 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2758
2759 assert_eq!(
2761 output.count, 0,
2762 "Should not emit consensus for cross-chromosomal chimeric pair"
2763 );
2764 }
2765
2766 #[test]
2770 fn test_not_emit_consensus_r1_end_in_indel() {
2771 let options = CodecConsensusOptions {
2772 min_reads_per_strand: 1,
2773 min_duplex_length: 1,
2774 ..Default::default()
2775 };
2776 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2777
2778 let reads = create_fr_pair(
2782 "read1",
2783 1,
2784 11,
2785 30,
2786 35,
2787 &[(Kind::Match, 30)],
2788 &[(Kind::Match, 19), (Kind::Deletion, 2), (Kind::Match, 11)],
2789 "hi",
2790 Some("ACC-TGA"),
2791 false,
2792 true,
2793 );
2794
2795 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2796
2797 assert_eq!(
2800 output.count, 0,
2801 "Should not emit consensus when R1's end lands in an indel in R2"
2802 );
2803 }
2804
2805 #[test]
2807 fn test_mask_end_qualities() {
2808 let options = CodecConsensusOptions {
2811 min_reads_per_strand: 1,
2812 min_duplex_length: 1,
2813 outer_bases_length: 7,
2814 outer_bases_qual: Some(5),
2815 ..Default::default()
2816 };
2817 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2818
2819 let reads = create_fr_pair(
2821 "read1",
2822 1,
2823 1,
2824 50,
2825 90, &[(Kind::Match, 50)],
2827 &[(Kind::Match, 50)],
2828 "hi",
2829 Some("ACC-TGA"),
2830 false,
2831 true,
2832 );
2833
2834 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2835
2836 assert_eq!(output.count, 1, "Should produce one consensus read");
2837
2838 let records = ParsedBamRecord::parse_all(&output.data);
2839 let consensus = &records[0];
2840 let quals = &consensus.quals;
2841
2842 for (i, &q) in quals.iter().take(7).enumerate() {
2845 assert!(q <= 5, "First 7 bases should have qual <= 5, but base {i} has qual {q}");
2846 }
2847
2848 let len = quals.len();
2850 for (i, &q) in quals.iter().skip(len.saturating_sub(7)).enumerate() {
2851 assert!(q <= 5, "Last 7 bases should have qual <= 5, but base {i} has qual {q}");
2852 }
2853 }
2854
2855 #[test]
2857 fn test_mask_single_stranded_regions() {
2858 let options = CodecConsensusOptions {
2861 min_reads_per_strand: 1,
2862 min_duplex_length: 1,
2863 single_strand_qual: Some(4),
2864 ..Default::default()
2865 };
2866 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2867
2868 let reads = create_fr_pair(
2873 "read1",
2874 1,
2875 20,
2876 30,
2877 90, &[(Kind::Match, 30)],
2879 &[(Kind::Match, 30)],
2880 "hi",
2881 Some("ACC-TGA"),
2882 false,
2883 true,
2884 );
2885
2886 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2887
2888 assert_eq!(output.count, 1, "Should produce one consensus read");
2889
2890 let records = ParsedBamRecord::parse_all(&output.data);
2894 let consensus = &records[0];
2895 let quals = &consensus.quals;
2896
2897 let has_low_qual = quals.iter().any(|&q| q <= 4);
2901 assert!(
2902 has_low_qual || quals.is_empty(),
2903 "Should have some low-quality single-strand regions (or be filtered)"
2904 );
2905 }
2906
2907 #[test]
2911 fn test_nocall_masking_uppercase_n() {
2912 let options = CodecConsensusOptions::default();
2913 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2914
2915 let ss_a = SingleStrandConsensus {
2922 bases: b"ANAnC".to_vec(),
2923 quals: vec![40, 2, 40, 0, 40], depths: vec![2, 0, 2, 0, 2],
2925 errors: vec![0, 0, 0, 0, 0],
2926 raw_read_count: 2,
2927 ref_start: 0,
2928 ref_end: 4,
2929 is_negative_strand: false,
2930 };
2931
2932 let ss_b = SingleStrandConsensus {
2933 bases: b"ATNGn".to_vec(),
2934 quals: vec![40, 40, 2, 40, 0], depths: vec![2, 2, 0, 2, 0],
2936 errors: vec![0, 0, 0, 0, 0],
2937 raw_read_count: 2,
2938 ref_start: 0,
2939 ref_end: 4,
2940 is_negative_strand: true,
2941 };
2942
2943 let duplex = caller
2944 .build_duplex_consensus_from_padded(&ss_a, &ss_b)
2945 .expect("Should build duplex consensus");
2946
2947 assert_eq!(duplex.bases[0], b'A', "Position 0: Both have A, should agree");
2949
2950 assert_eq!(
2952 duplex.bases[1], b'N',
2953 "Position 1: Strand A has uppercase N (NoCall), should mask to N"
2954 );
2955
2956 assert_eq!(
2958 duplex.bases[2], b'N',
2959 "Position 2: Strand B has uppercase N (NoCall), should mask to N"
2960 );
2961
2962 assert_eq!(
2964 duplex.bases[3], b'G',
2965 "Position 3: Strand A has lowercase n (padding), should use strand B's base G"
2966 );
2967
2968 assert_eq!(
2970 duplex.bases[4], b'C',
2971 "Position 4: Strand B has lowercase n (padding), should use strand A's base C"
2972 );
2973 }
2974
2975 #[test]
2976 fn test_to_source_read_for_codec_positive_strand() {
2977 use noodles::sam::alignment::record::cigar::op::Kind;
2978
2979 let read = create_test_paired_read(
2981 "read1",
2982 b"ACGT",
2983 &[30, 31, 32, 33],
2984 true,
2985 false, true,
2987 100,
2988 &[(Kind::Match, 4)],
2989 );
2990
2991 let source = CodecConsensusCaller::to_source_read_for_codec(&read, 0);
2992
2993 assert_eq!(source.bases, b"ACGT");
2995 assert_eq!(source.quals, vec![30, 31, 32, 33]);
2996 assert_eq!(source.original_idx, 0);
2997 }
2998
2999 #[test]
3000 fn test_to_source_read_for_codec_negative_strand() {
3001 use noodles::sam::alignment::record::cigar::op::Kind;
3002
3003 let read = create_test_paired_read(
3005 "read1",
3006 b"ACGT",
3007 &[30, 31, 32, 33],
3008 true,
3009 true, false,
3011 100,
3012 &[(Kind::Match, 4)],
3013 );
3014
3015 let source = CodecConsensusCaller::to_source_read_for_codec(&read, 1);
3016
3017 assert_eq!(source.bases, b"ACGT"); assert_eq!(source.quals, vec![33, 32, 31, 30]); assert_eq!(source.original_idx, 1);
3021 }
3022
3023 #[test]
3024 fn test_to_source_read_for_codec_negative_strand_non_palindrome() {
3025 use noodles::sam::alignment::record::cigar::op::Kind;
3026
3027 let read = create_test_paired_read(
3029 "read1",
3030 b"AACC", &[10, 20, 30, 40],
3032 true,
3033 true, false,
3035 100,
3036 &[(Kind::Match, 4)],
3037 );
3038
3039 let source = CodecConsensusCaller::to_source_read_for_codec(&read, 2);
3040
3041 assert_eq!(source.bases, b"GGTT");
3043 assert_eq!(source.quals, vec![40, 30, 20, 10]); assert_eq!(source.original_idx, 2);
3045 }
3046
3047 #[test]
3048 fn test_to_source_read_clip_exceeds_sequence_length() {
3049 use noodles::sam::alignment::record::cigar::op::Kind;
3050
3051 let read = create_test_paired_read(
3053 "read1",
3054 b"ACGT",
3055 &[30, 31, 32, 33],
3056 true,
3057 false, true,
3059 100,
3060 &[(Kind::Match, 4)],
3061 );
3062
3063 let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3064
3065 let source = CodecConsensusCaller::to_source_read_for_codec_raw(&raw, 0, 10, false, None);
3067 assert!(source.bases.is_empty(), "Bases should be empty after over-clipping");
3068 assert!(source.quals.is_empty(), "Quals should be empty after over-clipping");
3069
3070 let source = CodecConsensusCaller::to_source_read_for_codec_raw(&raw, 0, 10, true, None);
3072 assert!(source.bases.is_empty(), "Bases should be empty after over-clipping from start");
3073 }
3074
3075 #[test]
3076 fn test_to_source_read_clip_exact_sequence_length() {
3077 use noodles::sam::alignment::record::cigar::op::Kind;
3078
3079 let read = create_test_paired_read(
3081 "read1",
3082 b"ACGT",
3083 &[30, 31, 32, 33],
3084 true,
3085 false,
3086 true,
3087 100,
3088 &[(Kind::Match, 4)],
3089 );
3090
3091 let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3092 let source = CodecConsensusCaller::to_source_read_for_codec_raw(&raw, 0, 4, false, None);
3093 assert!(source.bases.is_empty());
3094 assert!(source.quals.is_empty());
3095 }
3096
3097 #[test]
3102 fn test_build_clipped_info_no_clip_forward() {
3103 use noodles::sam::alignment::record::cigar::op::Kind;
3104
3105 let read = create_test_paired_read(
3106 "read1",
3107 b"ACGTACGT",
3108 &[30; 8],
3109 true,
3110 false, true,
3112 100, &[(Kind::Match, 8)],
3114 );
3115 let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3116 let info = CodecConsensusCaller::build_clipped_info(&raw, 0, 0);
3117
3118 assert_eq!(info.raw_idx, 0);
3119 assert_eq!(info.clip_amount, 0);
3120 assert!(!info.clip_from_start, "Forward strand should not clip from start");
3121 assert_eq!(info.clipped_seq_len, 8);
3122 assert_eq!(info.adjusted_pos, 100); }
3124
3125 #[test]
3126 fn test_build_clipped_info_clip_from_end_forward() {
3127 use noodles::sam::alignment::record::cigar::op::Kind;
3128
3129 let read = create_test_paired_read(
3130 "read1",
3131 b"ACGTACGT",
3132 &[30; 8],
3133 true,
3134 false, true,
3136 100,
3137 &[(Kind::Match, 8)],
3138 );
3139 let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3140 let info = CodecConsensusCaller::build_clipped_info(&raw, 5, 3);
3141
3142 assert_eq!(info.raw_idx, 5);
3143 assert_eq!(info.clip_amount, 3);
3144 assert!(!info.clip_from_start, "Forward strand clips from end");
3145 assert_eq!(info.clipped_seq_len, 5); assert_eq!(info.adjusted_pos, 100); }
3148
3149 #[test]
3150 fn test_build_clipped_info_clip_from_start_reverse() {
3151 use noodles::sam::alignment::record::cigar::op::Kind;
3152
3153 let read = create_test_paired_read(
3154 "read1",
3155 b"ACGTACGT",
3156 &[30; 8],
3157 true,
3158 true, false,
3160 100,
3161 &[(Kind::Match, 8)],
3162 );
3163 let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3164 let info = CodecConsensusCaller::build_clipped_info(&raw, 2, 3);
3165
3166 assert_eq!(info.raw_idx, 2);
3167 assert_eq!(info.clip_amount, 3);
3168 assert!(info.clip_from_start, "Reverse strand should clip from start");
3169 assert_eq!(info.clipped_seq_len, 5); assert!(info.adjusted_pos > 100);
3172 }
3173
3174 #[test]
3175 fn test_build_clipped_info_zero_clip_preserves_all() {
3176 use noodles::sam::alignment::record::cigar::op::Kind;
3177
3178 let read = create_test_paired_read(
3179 "read1",
3180 b"ACGT",
3181 &[30; 4],
3182 true,
3183 false,
3184 true,
3185 50,
3186 &[(Kind::Match, 4)],
3187 );
3188 let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3189 let info = CodecConsensusCaller::build_clipped_info(&raw, 0, 0);
3190
3191 assert_eq!(info.clipped_seq_len, 4);
3192 assert_eq!(info.adjusted_pos, 50);
3193 assert_eq!(info.clipped_cigar.len(), 1); }
3195
3196 #[test]
3197 fn test_vanilla_to_single_strand_conversion() {
3198 let vcr = VanillaConsensusRead {
3199 id: "test".to_string(),
3200 bases: vec![b'A', b'C', b'G', b'T'],
3201 quals: vec![30, 31, 32, 33],
3202 depths: vec![5, 6, 7, 8],
3203 errors: vec![0, 1, 0, 1],
3204 source_reads: None,
3205 };
3206
3207 let ss = CodecConsensusCaller::vanilla_to_single_strand(vcr.clone(), false, 10);
3208
3209 assert_eq!(ss.bases, vcr.bases);
3210 assert_eq!(ss.quals, vcr.quals);
3211 assert_eq!(ss.depths, vcr.depths);
3212 assert_eq!(ss.errors, vcr.errors);
3213 assert_eq!(ss.raw_read_count, 10);
3214 assert!(!ss.is_negative_strand);
3215 assert_eq!(ss.ref_start, 0);
3216 assert_eq!(ss.ref_end, 3); }
3218
3219 #[test]
3220 fn test_vanilla_to_single_strand_negative_strand() {
3221 let vcr = VanillaConsensusRead {
3222 id: "test".to_string(),
3223 bases: vec![b'A', b'C'],
3224 quals: vec![30, 31],
3225 depths: vec![5, 6],
3226 errors: vec![0, 1],
3227 source_reads: None,
3228 };
3229
3230 let ss = CodecConsensusCaller::vanilla_to_single_strand(vcr, true, 5);
3231
3232 assert!(ss.is_negative_strand);
3233 assert_eq!(ss.raw_read_count, 5);
3234 }
3235
3236 #[test]
3237 fn test_codec_statistics_tracking() {
3238 let options = CodecConsensusOptions {
3240 min_reads_per_strand: 1,
3241 min_duplex_length: 1,
3242 ..Default::default()
3243 };
3244 let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3245
3246 let stats = caller.statistics();
3247
3248 assert_eq!(stats.total_input_reads, 0);
3250 assert_eq!(stats.consensus_reads_generated, 0);
3251 assert_eq!(stats.reads_filtered, 0);
3252 }
3253
3254 #[test]
3259 fn test_read_pos_at_ref_pos_simple_match() {
3260 use noodles::sam::alignment::record::cigar::op::Kind;
3261
3262 let read = create_test_paired_read(
3264 "read1",
3265 &[b'A'; 20],
3266 &[30; 20],
3267 true,
3268 false,
3269 true,
3270 100,
3271 &[(Kind::Match, 20)],
3272 );
3273
3274 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 100, false), Some(1));
3276
3277 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 105, false), Some(6));
3279
3280 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 119, false), Some(20));
3282
3283 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 99, false), None);
3285
3286 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 120, false), None);
3288 }
3289
3290 #[test]
3291 fn test_read_pos_at_ref_pos_with_insertion() {
3292 use noodles::sam::alignment::record::cigar::op::Kind;
3293
3294 let read = create_test_paired_read(
3296 "read1",
3297 &[b'A'; 25],
3298 &[30; 25],
3299 true,
3300 false,
3301 true,
3302 100,
3303 &[(Kind::Match, 10), (Kind::Insertion, 5), (Kind::Match, 10)],
3304 );
3305
3306 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 100, false), Some(1));
3308 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 109, false), Some(10));
3309
3310 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 110, false), Some(16));
3313 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 119, false), Some(25));
3314 }
3315
3316 #[test]
3317 fn test_read_pos_at_ref_pos_with_deletion() {
3318 use noodles::sam::alignment::record::cigar::op::Kind;
3319
3320 let read = create_test_paired_read(
3322 "read1",
3323 &[b'A'; 20],
3324 &[30; 20],
3325 true,
3326 false,
3327 true,
3328 100,
3329 &[(Kind::Match, 10), (Kind::Deletion, 5), (Kind::Match, 10)],
3330 );
3331
3332 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 100, false), Some(1));
3334 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 109, false), Some(10));
3335
3336 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 110, false), None);
3338 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 114, false), None);
3339
3340 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 110, true), Some(10));
3342 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 114, true), Some(10));
3343
3344 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 115, false), Some(11));
3346 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 124, false), Some(20));
3347 }
3348
3349 #[test]
3350 fn test_read_pos_at_ref_pos_with_soft_clip() {
3351 use noodles::sam::alignment::record::cigar::op::Kind;
3352
3353 let read = create_test_paired_read(
3356 "read1",
3357 &[b'A'; 20],
3358 &[30; 20],
3359 true,
3360 false,
3361 true,
3362 100,
3363 &[(Kind::SoftClip, 5), (Kind::Match, 15)],
3364 );
3365
3366 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 100, false), Some(6));
3368 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 114, false), Some(20));
3369 }
3370
3371 #[test]
3372 fn test_check_overlap_phase_compatible() {
3373 use noodles::sam::alignment::record::cigar::op::Kind;
3374
3375 let options = CodecConsensusOptions::default();
3376 let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3377
3378 let r1 = create_test_paired_read(
3380 "read1",
3381 &[b'A'; 30],
3382 &[30; 30],
3383 true,
3384 false,
3385 true,
3386 100,
3387 &[(Kind::Match, 30)],
3388 );
3389 let r2 = create_test_paired_read(
3390 "read2",
3391 &[b'A'; 30],
3392 &[30; 30],
3393 false,
3394 true,
3395 false,
3396 110,
3397 &[(Kind::Match, 30)],
3398 );
3399
3400 assert!(caller.check_overlap_phase(&r1, &r2, 110, 129));
3402 }
3403
3404 #[test]
3405 fn test_check_overlap_phase_indel_mismatch() {
3406 use noodles::sam::alignment::record::cigar::op::Kind;
3407
3408 let options = CodecConsensusOptions::default();
3409 let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3410
3411 let r1 = create_test_paired_read(
3414 "read1",
3415 &[b'A'; 30],
3416 &[30; 30],
3417 true,
3418 false,
3419 true,
3420 100,
3421 &[(Kind::Match, 30)],
3422 );
3423 let r2 = create_test_paired_read(
3424 "read2",
3425 &[b'A'; 30],
3426 &[30; 30],
3427 false,
3428 true,
3429 false,
3430 100,
3431 &[(Kind::Match, 15), (Kind::Deletion, 5), (Kind::Match, 15)],
3432 );
3433
3434 let _result = caller.check_overlap_phase(&r1, &r2, 100, 129);
3437 }
3438
3439 #[test]
3440 fn test_pad_consensus_right() {
3441 let ss = SingleStrandConsensus {
3443 bases: b"ACGT".to_vec(),
3444 quals: vec![30, 31, 32, 33],
3445 depths: vec![5, 6, 7, 8],
3446 errors: vec![0, 1, 0, 1],
3447 raw_read_count: 5,
3448 ref_start: 100,
3449 ref_end: 103,
3450 is_negative_strand: false,
3451 };
3452
3453 let padded = CodecConsensusCaller::pad_consensus(&ss, 8, false);
3454
3455 assert_eq!(padded.bases.len(), 8);
3456 assert_eq!(&padded.bases[0..4], b"ACGT");
3457 assert_eq!(&padded.bases[4..8], b"nnnn"); assert_eq!(padded.quals[4..8], vec![0, 0, 0, 0]);
3459 assert_eq!(padded.depths[4..8], vec![0, 0, 0, 0]);
3460 }
3461
3462 #[test]
3463 fn test_pad_consensus_left() {
3464 let ss = SingleStrandConsensus {
3466 bases: b"ACGT".to_vec(),
3467 quals: vec![30, 31, 32, 33],
3468 depths: vec![5, 6, 7, 8],
3469 errors: vec![0, 1, 0, 1],
3470 raw_read_count: 5,
3471 ref_start: 100,
3472 ref_end: 103,
3473 is_negative_strand: true,
3474 };
3475
3476 let padded = CodecConsensusCaller::pad_consensus(&ss, 8, true);
3477
3478 assert_eq!(padded.bases.len(), 8);
3479 assert_eq!(&padded.bases[0..4], b"nnnn"); assert_eq!(&padded.bases[4..8], b"ACGT");
3481 assert_eq!(padded.quals[0..4], vec![0, 0, 0, 0]);
3482 assert_eq!(padded.depths[0..4], vec![0, 0, 0, 0]);
3483 }
3484
3485 #[test]
3486 fn test_pad_consensus_no_padding_needed() {
3487 let ss = SingleStrandConsensus {
3489 bases: b"ACGT".to_vec(),
3490 quals: vec![30, 31, 32, 33],
3491 depths: vec![5, 6, 7, 8],
3492 errors: vec![0, 1, 0, 1],
3493 raw_read_count: 5,
3494 ref_start: 100,
3495 ref_end: 103,
3496 is_negative_strand: false,
3497 };
3498
3499 let padded = CodecConsensusCaller::pad_consensus(&ss, 4, false);
3500
3501 assert_eq!(padded.bases, ss.bases);
3503 assert_eq!(padded.quals, ss.quals);
3504 }
3505
3506 #[test]
3507 fn test_pad_consensus_shorter_target() {
3508 let ss = SingleStrandConsensus {
3510 bases: b"ACGT".to_vec(),
3511 quals: vec![30, 31, 32, 33],
3512 depths: vec![5, 6, 7, 8],
3513 errors: vec![0, 1, 0, 1],
3514 raw_read_count: 5,
3515 ref_start: 100,
3516 ref_end: 103,
3517 is_negative_strand: false,
3518 };
3519
3520 let padded = CodecConsensusCaller::pad_consensus(&ss, 2, false);
3521
3522 assert_eq!(padded.bases.len(), 4);
3524 }
3525
3526 #[test]
3527 fn test_mask_consensus_quals_query_based_single_strand() {
3528 let options = CodecConsensusOptions { single_strand_qual: Some(10), ..Default::default() };
3530 let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3531
3532 let consensus = SingleStrandConsensus {
3533 bases: b"ACGT".to_vec(),
3534 quals: vec![30, 30, 30, 30],
3535 depths: vec![5; 4],
3536 errors: vec![0; 4],
3537 raw_read_count: 5,
3538 ref_start: 0,
3539 ref_end: 3,
3540 is_negative_strand: false,
3541 };
3542
3543 let padded_r1 = SingleStrandConsensus {
3545 bases: b"NCGT".to_vec(),
3546 quals: vec![0, 30, 30, 30],
3547 depths: vec![0, 5, 5, 5],
3548 errors: vec![0; 4],
3549 raw_read_count: 5,
3550 ref_start: 0,
3551 ref_end: 3,
3552 is_negative_strand: false,
3553 };
3554
3555 let padded_r2 = SingleStrandConsensus {
3557 bases: b"ACGN".to_vec(),
3558 quals: vec![30, 30, 30, 0],
3559 depths: vec![5, 5, 5, 0],
3560 errors: vec![0; 4],
3561 raw_read_count: 5,
3562 ref_start: 0,
3563 ref_end: 3,
3564 is_negative_strand: true,
3565 };
3566
3567 let masked = caller.mask_consensus_quals_query_based(consensus, &padded_r1, &padded_r2);
3568
3569 assert_eq!(masked.quals[0], 10);
3571 assert_eq!(masked.quals[3], 10);
3573 assert_eq!(masked.quals[1], 30);
3575 assert_eq!(masked.quals[2], 30);
3576 }
3577
3578 #[test]
3579 fn test_is_cigar_prefix_exact_match() {
3580 use noodles::sam::alignment::record::cigar::op::Kind;
3581
3582 let a = vec![(Kind::Match, 50)];
3584 let b = vec![(Kind::Match, 50)];
3585 assert!(cigar_utils::is_cigar_prefix(&a, &b));
3586 }
3587
3588 #[test]
3589 fn test_is_cigar_prefix_shorter_last_element() {
3590 use noodles::sam::alignment::record::cigar::op::Kind;
3591
3592 let a = vec![(Kind::Match, 40)];
3594 let b = vec![(Kind::Match, 50)];
3595 assert!(cigar_utils::is_cigar_prefix(&a, &b));
3596 }
3597
3598 #[test]
3599 fn test_is_cigar_prefix_longer_not_prefix() {
3600 use noodles::sam::alignment::record::cigar::op::Kind;
3601
3602 let a = vec![(Kind::Match, 50)];
3604 let b = vec![(Kind::Match, 40)];
3605 assert!(!cigar_utils::is_cigar_prefix(&a, &b));
3606 }
3607
3608 #[test]
3609 fn test_is_cigar_prefix_different_ops() {
3610 use noodles::sam::alignment::record::cigar::op::Kind;
3611
3612 let a = vec![(Kind::Match, 50)];
3614 let b = vec![(Kind::Insertion, 50)];
3615 assert!(!cigar_utils::is_cigar_prefix(&a, &b));
3616 }
3617
3618 #[test]
3619 fn test_is_cigar_prefix_complex() {
3620 use noodles::sam::alignment::record::cigar::op::Kind;
3621
3622 let a = vec![(Kind::Match, 25), (Kind::Deletion, 5), (Kind::Match, 20)];
3624 let b = vec![(Kind::Match, 25), (Kind::Deletion, 5), (Kind::Match, 30)];
3625 assert!(cigar_utils::is_cigar_prefix(&a, &b));
3626
3627 let c = vec![(Kind::Match, 25), (Kind::Deletion, 3), (Kind::Match, 20)];
3629 assert!(!cigar_utils::is_cigar_prefix(&c, &b));
3630 }
3631
3632 #[test]
3633 fn test_reverse_complement_ss_depths_errors_reversed() {
3634 let ss = SingleStrandConsensus {
3636 bases: b"ACGT".to_vec(),
3637 quals: vec![10, 20, 30, 40],
3638 depths: vec![1, 2, 3, 4],
3639 errors: vec![5, 6, 7, 8],
3640 raw_read_count: 5,
3641 ref_start: 100,
3642 ref_end: 103,
3643 is_negative_strand: false,
3644 };
3645
3646 let rc = CodecConsensusCaller::reverse_complement_ss(&ss);
3647
3648 assert_eq!(rc.bases, b"ACGT"); assert_eq!(rc.quals, vec![40, 30, 20, 10]);
3653
3654 assert_eq!(rc.depths, vec![4, 3, 2, 1]);
3656 assert_eq!(rc.errors, vec![8, 7, 6, 5]);
3657
3658 assert!(rc.is_negative_strand);
3660 }
3661
3662 #[test]
3663 fn test_downsample_pairs() {
3664 use noodles::sam::alignment::record::cigar::op::Kind;
3665
3666 let options = CodecConsensusOptions { max_reads_per_strand: Some(2), ..Default::default() };
3667 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3668
3669 let r1s: Vec<RecordBuf> = (0..5)
3671 .map(|i| {
3672 create_test_paired_read(
3673 &format!("r{i}"),
3674 &[b'A'; 20],
3675 &[30; 20],
3676 true,
3677 false,
3678 true,
3679 100,
3680 &[(Kind::Match, 20)],
3681 )
3682 })
3683 .collect();
3684 let r2s: Vec<RecordBuf> = (0..5)
3685 .map(|i| {
3686 create_test_paired_read(
3687 &format!("r{i}"),
3688 &[b'A'; 20],
3689 &[30; 20],
3690 false,
3691 true,
3692 false,
3693 110,
3694 &[(Kind::Match, 20)],
3695 )
3696 })
3697 .collect();
3698
3699 let (ds_r1s, ds_r2s) = caller.downsample_pairs(r1s, r2s);
3700
3701 assert_eq!(ds_r1s.len(), 2);
3702 assert_eq!(ds_r2s.len(), 2);
3703 }
3704
3705 #[test]
3706 fn test_downsample_pairs_no_limit() {
3707 use noodles::sam::alignment::record::cigar::op::Kind;
3708
3709 let options = CodecConsensusOptions { max_reads_per_strand: None, ..Default::default() };
3710 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3711
3712 let r1s: Vec<RecordBuf> = (0..5)
3713 .map(|i| {
3714 create_test_paired_read(
3715 &format!("r{i}"),
3716 &[b'A'; 20],
3717 &[30; 20],
3718 true,
3719 false,
3720 true,
3721 100,
3722 &[(Kind::Match, 20)],
3723 )
3724 })
3725 .collect();
3726 let r2s: Vec<RecordBuf> = (0..5)
3727 .map(|i| {
3728 create_test_paired_read(
3729 &format!("r{i}"),
3730 &[b'A'; 20],
3731 &[30; 20],
3732 false,
3733 true,
3734 false,
3735 110,
3736 &[(Kind::Match, 20)],
3737 )
3738 })
3739 .collect();
3740
3741 let (ds_r1s, ds_r2s) = caller.downsample_pairs(r1s, r2s);
3742
3743 assert_eq!(ds_r1s.len(), 5);
3745 assert_eq!(ds_r2s.len(), 5);
3746 }
3747
3748 #[test]
3749 fn test_compute_codec_consensus_length() {
3750 use noodles::sam::alignment::record::cigar::op::Kind;
3751
3752 let pos_read = create_test_paired_read(
3756 "pos",
3757 &[b'A'; 30],
3758 &[30; 30],
3759 true,
3760 false,
3761 true,
3762 100,
3763 &[(Kind::Match, 30)],
3764 );
3765 let neg_read = create_test_paired_read(
3766 "neg",
3767 &[b'A'; 30],
3768 &[30; 30],
3769 false,
3770 true,
3771 false,
3772 110,
3773 &[(Kind::Match, 30)],
3774 );
3775
3776 let length =
3778 CodecConsensusCaller::compute_codec_consensus_length(&pos_read, &neg_read, 129);
3779
3780 assert_eq!(length, Some(40));
3785 }
3786
3787 #[test]
3788 fn test_compute_codec_consensus_length_in_deletion() {
3789 use noodles::sam::alignment::record::cigar::op::Kind;
3790
3791 let pos_read = create_test_paired_read(
3793 "pos",
3794 &[b'A'; 30],
3795 &[30; 30],
3796 true,
3797 false,
3798 true,
3799 100,
3800 &[(Kind::Match, 15), (Kind::Deletion, 5), (Kind::Match, 15)],
3801 );
3802 let neg_read = create_test_paired_read(
3803 "neg",
3804 &[b'A'; 30],
3805 &[30; 30],
3806 false,
3807 true,
3808 false,
3809 100,
3810 &[(Kind::Match, 30)],
3811 );
3812
3813 let length =
3815 CodecConsensusCaller::compute_codec_consensus_length(&pos_read, &neg_read, 116);
3816
3817 assert!(length.is_none());
3819 }
3820
3821 #[test]
3822 fn test_reject_records_tracking() {
3823 use noodles::sam::alignment::record::cigar::op::Kind;
3824
3825 let options = CodecConsensusOptions {
3826 min_reads_per_strand: 2, ..Default::default()
3828 };
3829 let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3830
3831 let reads = create_fr_pair(
3833 "read1",
3834 100,
3835 110,
3836 30,
3837 35,
3838 &[(Kind::Match, 30)],
3839 &[(Kind::Match, 30)],
3840 "UMI1",
3841 None,
3842 false,
3843 true,
3844 );
3845
3846 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
3847
3848 assert_eq!(output.count, 0);
3849 assert!(
3850 caller.stats.rejection_reasons.contains_key(&CallerRejectionReason::InsufficientReads)
3851 );
3852 }
3853
3854 #[test]
3855 fn test_rejected_reads_tracking_enabled() {
3856 use noodles::sam::alignment::record::cigar::op::Kind;
3857
3858 let options = CodecConsensusOptions {
3859 min_reads_per_strand: 2, ..Default::default()
3861 };
3862 let mut caller = CodecConsensusCaller::new_with_rejects_tracking(
3863 "codec".to_string(),
3864 "RG1".to_string(),
3865 options,
3866 true, );
3868
3869 let reads = create_fr_pair(
3870 "read1",
3871 100,
3872 110,
3873 30,
3874 35,
3875 &[(Kind::Match, 30)],
3876 &[(Kind::Match, 30)],
3877 "UMI1",
3878 None,
3879 false,
3880 true,
3881 );
3882
3883 let output = caller.consensus_reads_from_sam_records(reads).unwrap();
3884
3885 assert_eq!(output.count, 0);
3886 assert!(!caller.rejected_reads().is_empty() || caller.statistics().reads_filtered > 0);
3888 }
3889
3890 #[test]
3891 fn test_clear_rejected_reads() {
3892 let options = CodecConsensusOptions::default();
3893 let mut caller = CodecConsensusCaller::new_with_rejects_tracking(
3894 "codec".to_string(),
3895 "RG1".to_string(),
3896 options,
3897 true,
3898 );
3899
3900 assert!(caller.rejected_reads().is_empty());
3902
3903 caller.clear_rejected_reads();
3905 assert!(caller.rejected_reads().is_empty());
3906 }
3907
3908 #[test]
3913 fn test_read_pos_at_ref_pos_with_hard_clip() {
3914 let record = RecordBuilder::new()
3916 .sequence("ACGTA") .alignment_start(100)
3918 .cigar("2H5M2H")
3919 .build();
3920
3921 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&record, 100, false), Some(1));
3923 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&record, 104, false), Some(5));
3925 }
3926
3927 #[test]
3928 fn test_read_pos_at_ref_pos_deletion_at_start() {
3929 let record = RecordBuilder::new()
3933 .sequence("ACGTA") .alignment_start(100)
3935 .cigar("2D5M")
3936 .build();
3937
3938 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&record, 100, true), Some(1));
3941 assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&record, 100, false), None);
3942 }
3943
3944 #[test]
3945 fn test_check_overlap_phase_matching_cigars() {
3946 let options = CodecConsensusOptions::default();
3947 let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3948
3949 let r1 = RecordBuilder::new()
3951 .sequence(&"A".repeat(50))
3952 .alignment_start(100)
3953 .cigar("50M")
3954 .build();
3955
3956 let r2 = RecordBuilder::new()
3957 .sequence(&"A".repeat(50))
3958 .alignment_start(120)
3959 .cigar("50M")
3960 .build();
3961
3962 assert!(caller.check_overlap_phase(&r1, &r2, 120, 149));
3964 }
3965
3966 #[test]
3967 fn test_check_overlap_phase_deletion_mismatch() {
3968 let options = CodecConsensusOptions::default();
3969 let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3970
3971 let r1 = RecordBuilder::new()
3973 .sequence(&"A".repeat(50))
3974 .alignment_start(100)
3975 .cigar("50M")
3976 .build();
3977
3978 let r2 = RecordBuilder::new()
3980 .sequence(&"A".repeat(48)) .alignment_start(120)
3982 .cigar("15M2D33M")
3983 .build();
3984
3985 assert!(!caller.check_overlap_phase(&r1, &r2, 120, 149));
3987 }
3988}