Skip to main content

fgumi_consensus/
codec_caller.rs

1//! # CODEC Consensus Calling
2//!
3//! This module implements consensus calling for CODEC sequencing protocol, where a single
4//! read-pair sequences both strands of the original duplex molecule.
5//!
6//! ## Overview
7//!
8//! CODEC (Bae et al 2023: <https://doi.org/10.1038/s41588-023-01376-0>) is a sequencing
9//! protocol where:
10//! - Each read-pair sequences **both** strands of the original duplex molecule
11//! - R1 comes from one strand, R2 comes from the opposite strand
12//! - Even a single read-pair can generate duplex consensus
13//! - Output is a **single fragment read** per duplex (not paired-end)
14//!
15//! ## Key Differences from Regular Duplex
16//!
17//! - **Regular Duplex**: Multiple read-pairs, separate /A and /B families, pairs consensus
18//! - **CODEC**: Single read-pair can work, R1 and R2 are inherently from opposite strands
19//! - **Output**: Single fragment (not pair) representing the full duplex molecule
20//!
21//! ## Algorithm Overview
22//!
23//! The CODEC consensus calling process involves these steps:
24//!
25//! 1. **Clip** read pairs where they extend past mate ends
26//! 2. **Filter** R1s and R2s separately for compatible CIGARs
27//! 3. **Check overlap** - must have sufficient overlap on genome
28//! 4. **Build single-strand consensus** from R1s separately from R2s
29//! 5. **Reverse complement** one SS read to align orientations
30//! 6. **Pad with Ns** if reads don't fully overlap
31//! 7. **Build duplex consensus** from the two SS consensus reads
32//! 8. **Apply quality masking** based on single-strand vs duplex regions
33//! 9. **Reverse complement** back if original R1s were on negative strand
34//!
35//! ## Critical Requirements
36//!
37//! - Reads must be **mapped** and **overlap** on the genome
38//! - Uses genome alignment to synchronize reads for duplex calling
39//! - Rejects unmapped, chimeric, or non-overlapping pairs
40//!
41//! ## Usage Example
42//!
43//! ```rust,ignore
44//! use fgumi_lib::consensus::codec_caller::{CodecConsensusCaller, CodecConsensusOptions};
45//!
46//! let options = CodecConsensusOptions {
47//!     min_reads_per_strand: 1,
48//!     min_duplex_length: 1,
49//!     ..Default::default()
50//! };
51//!
52//! let mut caller = CodecConsensusCaller::new(
53//!     "codec".to_string(),
54//!     "RG1".to_string(),
55//!     options,
56//! );
57//!
58//! // Process reads - R1s and R2s from same molecule
59//! let consensus = caller.consensus_reads_from_sam_records(reads)?;
60//! ```
61//!
62//! ## Output Format
63//!
64//! - **Unmapped**: Output reads are unmapped fragments
65//! - **Single fragment**: Not paired-end
66//! - **Tags added** (similar to duplex):
67//!   - `aD, bD, cD` - Max depth (int)
68//!   - `aM, bM, cM` - Min depth (int)
69//!   - `aE, bE, cE` - Error rate (float)
70//!   - `ad, bd` - Per-base depth (short[])
71//!   - `ae, be` - Per-base errors (short[])
72//!   - `ac, bc` - Single-strand consensus bases (string)
73//!   - `aq, bq` - Single-strand consensus qualities (string)
74
75use crate::caller::{
76    ConsensusCaller, ConsensusCallingStats, ConsensusOutput,
77    RejectionReason as CallerRejectionReason,
78};
79use crate::phred::{MIN_PHRED, NO_CALL_BASE, NO_CALL_BASE_LOWER, PhredScore};
80use crate::simple_umi::consensus_umis;
81use crate::vanilla_caller::{
82    VanillaConsensusRead, VanillaUmiConsensusCaller, VanillaUmiConsensusOptions,
83};
84use crate::{IndexedSourceRead, SourceRead, select_most_common_alignment_group};
85use anyhow::Result;
86use fgumi_dna::dna::reverse_complement;
87use fgumi_raw_bam::{self as bam_fields, UnmappedBamRecordBuilder, flags};
88use noodles::sam::alignment::record::data::field::Tag;
89use rand::SeedableRng;
90use rand::rngs::StdRng;
91use rand::seq::SliceRandom;
92use std::cmp::{max, min};
93use std::collections::HashMap;
94
95/// Options for CODEC consensus calling
96#[derive(Debug, Clone)]
97pub struct CodecConsensusOptions {
98    /// Minimum base quality to include in consensus
99    pub min_input_base_quality: PhredScore,
100
101    /// Pre-UMI error rate (Phred scale)
102    pub error_rate_pre_umi: PhredScore,
103
104    /// Post-UMI error rate (Phred scale)
105    pub error_rate_post_umi: PhredScore,
106
107    /// Minimum number of read pairs required to form consensus per strand
108    pub min_reads_per_strand: usize,
109
110    /// Maximum number of read pairs to use per strand (downsample if exceeded)
111    pub max_reads_per_strand: Option<usize>,
112
113    /// Minimum duplex overlap length (in bases)
114    pub min_duplex_length: usize,
115
116    /// Reduce quality of single-strand regions to this value (if set)
117    pub single_strand_qual: Option<PhredScore>,
118
119    /// Reduce quality of outer bases to this value (if set)
120    pub outer_bases_qual: Option<PhredScore>,
121
122    /// Number of outer bases to reduce quality for
123    pub outer_bases_length: usize,
124
125    /// Maximum number of duplex disagreements allowed
126    pub max_duplex_disagreements: usize,
127
128    /// Maximum duplex disagreement rate allowed (0.0-1.0)
129    pub max_duplex_disagreement_rate: f64,
130
131    /// Cell barcode tag (e.g., "CB")
132    pub cell_tag: Option<Tag>,
133
134    /// Whether to produce per-base tags (ad, ae, bd, be, etc.)
135    pub produce_per_base_tags: bool,
136
137    /// Whether to quality-trim reads before consensus calling
138    pub trim: bool,
139
140    /// Minimum consensus base quality (output bases below this are masked to N)
141    pub min_consensus_base_quality: PhredScore,
142}
143
144impl Default for CodecConsensusOptions {
145    fn default() -> Self {
146        Self {
147            min_input_base_quality: 10,
148            error_rate_pre_umi: 45,
149            error_rate_post_umi: 40,
150            min_reads_per_strand: 1,
151            max_reads_per_strand: None,
152            min_duplex_length: 1,
153            single_strand_qual: None,
154            outer_bases_qual: None,
155            outer_bases_length: 5,
156            max_duplex_disagreements: usize::MAX,
157            max_duplex_disagreement_rate: 1.0,
158            cell_tag: None,
159            produce_per_base_tags: false,
160            trim: false,
161            min_consensus_base_quality: 0,
162        }
163    }
164}
165
166/// Statistics for CODEC consensus calling
167#[derive(Debug, Clone, Default)]
168pub struct CodecConsensusStats {
169    /// Total input reads processed
170    pub total_input_reads: u64,
171
172    /// Total consensus reads generated
173    pub consensus_reads_generated: u64,
174
175    /// Total reads filtered/rejected
176    pub reads_filtered: u64,
177
178    /// Consensus reads rejected for high duplex disagreement
179    pub consensus_reads_rejected_hdd: u64,
180
181    /// Total consensus bases emitted
182    pub consensus_bases_emitted: u64,
183
184    /// Total duplex region bases emitted
185    pub consensus_duplex_bases_emitted: u64,
186
187    /// Total duplex disagreement bases
188    pub duplex_disagreement_base_count: u64,
189
190    /// Rejection reasons
191    pub rejection_reasons: HashMap<CallerRejectionReason, usize>,
192}
193
194impl CodecConsensusStats {
195    /// Calculates the duplex disagreement rate
196    #[must_use]
197    pub fn duplex_disagreement_rate(&self) -> f64 {
198        if self.consensus_duplex_bases_emitted > 0 {
199            #[expect(
200                clippy::cast_precision_loss,
201                reason = "acceptable precision loss for rate computation"
202            )]
203            {
204                self.duplex_disagreement_base_count as f64
205                    / self.consensus_duplex_bases_emitted as f64
206            }
207        } else {
208            0.0
209        }
210    }
211}
212
213/// Single-strand consensus read (intermediate result for internal use only)
214///
215/// This struct is used internally by `CodecConsensusCaller` during duplex
216/// consensus building. It is NOT exported from this module - external code
217/// should use `VanillaConsensusRead` instead.
218#[derive(Debug, Clone)]
219struct SingleStrandConsensus {
220    /// Consensus sequence bases
221    bases: Vec<u8>,
222    /// Consensus quality scores
223    quals: Vec<PhredScore>,
224    /// Per-base read depth
225    depths: Vec<u16>,
226    /// Per-base error count
227    errors: Vec<u16>,
228    /// Total raw read count
229    raw_read_count: usize,
230    /// Reference start position (1-based)
231    ref_start: usize,
232    /// Reference end position (1-based, inclusive)
233    ref_end: usize,
234    /// Whether the original reads were on the negative strand
235    is_negative_strand: bool,
236}
237
238/// Precomputed clip metadata for a single record in the raw-byte codec pipeline.
239///
240/// Holds all information needed for filtering, overlap calculation, and
241/// `SourceRead` construction without materializing a `RecordBuf`.
242struct ClippedRecordInfo {
243    /// Index into the original raw records vec.
244    raw_idx: usize,
245    /// Number of query bases to clip.
246    clip_amount: usize,
247    /// `true` ⟹ clip from start (negative-strand reads).
248    clip_from_start: bool,
249    /// Sequence length after clipping.
250    clipped_seq_len: usize,
251    /// CIGAR ops after clipping (raw u32 words).
252    clipped_cigar: Vec<u32>,
253    /// 1-based alignment start after clipping.
254    adjusted_pos: usize,
255    /// BAM flags.
256    flags: u16,
257}
258
259/// CODEC consensus caller
260///
261/// Calls consensus from CODEC sequencing data where R1 and R2 from a single read-pair
262/// sequence opposite strands of the same molecule.
263pub struct CodecConsensusCaller {
264    /// Prefix for consensus read names
265    read_name_prefix: String,
266
267    /// Read group ID for consensus reads
268    read_group_id: String,
269
270    /// Consensus calling options
271    options: CodecConsensusOptions,
272
273    /// Statistics tracker
274    stats: CodecConsensusStats,
275
276    /// Random number generator for downsampling
277    rng: StdRng,
278
279    /// Counter for consensus read naming
280    consensus_counter: u64,
281
282    /// Single-strand consensus caller (matches fgbio's ssCaller delegation pattern)
283    /// CODEC delegates to `VanillaUmiConsensusCaller` for single-strand consensus building
284    ss_caller: VanillaUmiConsensusCaller,
285
286    /// Reusable builder for raw-byte BAM record output
287    bam_builder: UnmappedBamRecordBuilder,
288
289    /// Whether to store rejected reads (raw bytes) for output
290    track_rejects: bool,
291
292    /// Rejected raw BAM records (only populated when `track_rejects` is true)
293    rejected_reads: Vec<Vec<u8>>,
294}
295
296#[expect(
297    clippy::similar_names,
298    clippy::cast_precision_loss,
299    clippy::cast_possible_truncation,
300    clippy::cast_possible_wrap,
301    clippy::cast_sign_loss,
302    reason = "consensus calling uses numeric casts for quality/position math and has domain-specific variable names"
303)]
304impl CodecConsensusCaller {
305    /// Creates a new CODEC consensus caller
306    ///
307    /// # Arguments
308    /// * `read_name_prefix` - Prefix for consensus read names
309    /// * `read_group_id` - Read group ID for consensus reads
310    /// * `options` - CODEC consensus calling options
311    #[must_use]
312    pub fn new(
313        read_name_prefix: String,
314        read_group_id: String,
315        options: CodecConsensusOptions,
316    ) -> Self {
317        let rng = StdRng::seed_from_u64(42);
318
319        // Create VanillaUmiConsensusCaller for single-strand consensus building
320        // Matches fgbio's ssCaller initialization in DuplexConsensusCaller:
321        // - minReads = 1 (CODEC handles filtering itself)
322        // - producePerBaseTags = true (needed for duplex consensus calculation)
323        // - minConsensusBaseQuality = MIN (we handle quality masking ourselves)
324        let ss_options = VanillaUmiConsensusOptions {
325            tag: "MI".to_string(),
326            error_rate_pre_umi: options.error_rate_pre_umi,
327            error_rate_post_umi: options.error_rate_post_umi,
328            min_input_base_quality: options.min_input_base_quality,
329            min_reads: 1, // CODEC handles read filtering
330            max_reads: None,
331            produce_per_base_tags: true,
332            seed: None,
333            trim: false,
334            min_consensus_base_quality: 0, // MIN - we handle quality masking
335            cell_tag: None,
336        };
337
338        let ss_caller = VanillaUmiConsensusCaller::new(
339            "x".to_string(), // Temporary prefix (not used in final output)
340            read_group_id.clone(),
341            ss_options,
342        );
343
344        Self {
345            read_name_prefix,
346            read_group_id,
347            options,
348            stats: CodecConsensusStats::default(),
349            rng,
350            consensus_counter: 0,
351            ss_caller,
352            bam_builder: UnmappedBamRecordBuilder::new(),
353            track_rejects: false,
354            rejected_reads: Vec::new(),
355        }
356    }
357
358    /// Creates a new CODEC consensus caller with optional rejected-reads tracking.
359    ///
360    /// When `track_rejects` is `true`, raw BAM bytes of rejected reads are stored
361    /// and can be retrieved via [`rejected_reads`] / [`take_rejected_reads`].
362    #[must_use]
363    pub fn new_with_rejects_tracking(
364        read_name_prefix: String,
365        read_group_id: String,
366        options: CodecConsensusOptions,
367        track_rejects: bool,
368    ) -> Self {
369        let mut caller = Self::new(read_name_prefix, read_group_id, options);
370        caller.track_rejects = track_rejects;
371        caller
372    }
373
374    /// Returns the statistics for this caller
375    #[must_use]
376    pub fn statistics(&self) -> &CodecConsensusStats {
377        &self.stats
378    }
379
380    /// Clears all per-group state to prepare for reuse
381    ///
382    /// This resets statistics while preserving the caller's
383    /// configuration and reusable components (consensus builder, RNG, etc.).
384    pub fn clear(&mut self) {
385        self.stats = CodecConsensusStats::default();
386        self.ss_caller.clear();
387        self.rejected_reads.clear();
388    }
389
390    /// Returns a reference to the rejected reads (raw BAM bytes).
391    #[must_use]
392    pub fn rejected_reads(&self) -> &[Vec<u8>] {
393        &self.rejected_reads
394    }
395
396    /// Clears the stored rejected reads.
397    pub fn clear_rejected_reads(&mut self) {
398        self.rejected_reads.clear();
399    }
400
401    /// Takes ownership of the stored rejected reads, leaving an empty vec.
402    pub fn take_rejected_reads(&mut self) -> Vec<Vec<u8>> {
403        std::mem::take(&mut self.rejected_reads)
404    }
405
406    /// Converts raw BAM bytes to `SourceRead` for CODEC consensus calling,
407    /// applying virtual clipping.
408    ///
409    /// Simpler than vanilla's `create_source_read`: no quality masking,
410    /// no trailing N removal, no quality trimming.
411    fn to_source_read_for_codec_raw(
412        raw: &[u8],
413        original_idx: usize,
414        clip_amount: usize,
415        clip_from_start: bool,
416        clipped_cigar: Option<&[u32]>,
417    ) -> SourceRead {
418        let mut bases = bam_fields::extract_sequence(raw);
419        let mut quals = bam_fields::quality_scores_slice(raw).to_vec();
420        let flg = bam_fields::flags(raw);
421
422        // Apply clipping: truncate bases and quals
423        // Clamp clip_amount to avoid panic on malformed input (e.g., CIGAR/MC mismatch)
424        let clip_amount = clip_amount.min(bases.len());
425        if clip_amount > 0 {
426            if clip_from_start {
427                bases = bases[clip_amount..].to_vec();
428                quals = quals[clip_amount..].to_vec();
429            } else {
430                let new_len = bases.len() - clip_amount;
431                bases.truncate(new_len);
432                quals.truncate(new_len);
433            }
434        }
435
436        // Get simplified CIGAR — use pre-clipped CIGAR if available to avoid redundant work
437        let mut simplified = if let Some(ops) = clipped_cigar {
438            bam_fields::simplify_cigar_from_raw(ops)
439        } else {
440            let original_ops = bam_fields::get_cigar_ops(raw);
441            let (clipped_ops, _) =
442                bam_fields::clip_cigar_ops_raw(&original_ops, clip_amount, clip_from_start);
443            bam_fields::simplify_cigar_from_raw(&clipped_ops)
444        };
445
446        let is_negative = flg & flags::REVERSE != 0;
447        if is_negative {
448            simplified.reverse();
449            bases = reverse_complement(&bases);
450            quals.reverse();
451        }
452
453        SourceRead { original_idx, bases, quals, simplified_cigar: simplified, flags: flg }
454    }
455
456    /// Converts a `VanillaConsensusRead` to a `SingleStrandConsensus`.
457    ///
458    /// This bridges the delegation pattern: `CodecConsensusCaller` delegates to
459    /// `VanillaUmiConsensusCaller` for single-strand consensus, then converts
460    /// the result to CODEC's internal `SingleStrandConsensus` type.
461    fn vanilla_to_single_strand(
462        vcr: VanillaConsensusRead,
463        is_negative_strand: bool,
464        raw_read_count: usize,
465    ) -> SingleStrandConsensus {
466        let len = vcr.bases.len();
467        SingleStrandConsensus {
468            bases: vcr.bases,
469            quals: vcr.quals,
470            depths: vcr.depths,
471            errors: vcr.errors,
472            raw_read_count,
473            ref_start: 0,
474            ref_end: len.saturating_sub(1),
475            is_negative_strand,
476        }
477    }
478
479    /// Reverse complements a `SingleStrandConsensus`
480    ///
481    /// This converts the bases from the R2 orientation (reverse complement of reference)
482    /// to the R1 orientation (matching reference).
483    ///
484    /// For CODEC (query-based consensus):
485    /// - Bases, quals, depths, and errors are ALL reversed (they're in query space)
486    /// - This ensures depths/errors stay aligned with the reversed bases
487    fn reverse_complement_ss(ss: &SingleStrandConsensus) -> SingleStrandConsensus {
488        SingleStrandConsensus {
489            bases: reverse_complement(&ss.bases),
490            quals: ss.quals.iter().copied().rev().collect(),
491            // For CODEC query-based consensus, depths and errors must also be reversed
492            // to stay aligned with the reversed bases
493            depths: ss.depths.iter().copied().rev().collect(),
494            errors: ss.errors.iter().copied().rev().collect(),
495            raw_read_count: ss.raw_read_count,
496            ref_start: ss.ref_start,
497            ref_end: ss.ref_end,
498            is_negative_strand: !ss.is_negative_strand,
499        }
500    }
501
502    /// Calls consensus reads from raw BAM byte records for a single molecule.
503    ///
504    /// This is the primary entry point, working directly on raw bytes without
505    /// materializing `RecordBuf`. Virtual clipping is computed and applied as
506    /// offsets when extracting data for consensus.
507    #[expect(
508        clippy::too_many_lines,
509        reason = "consensus pipeline has many sequential steps that are clearest in one function"
510    )]
511    fn consensus_reads_raw(&mut self, records: &[Vec<u8>]) -> Result<ConsensusOutput> {
512        self.stats.total_input_reads += records.len() as u64;
513
514        if records.is_empty() {
515            return Ok(ConsensusOutput::default());
516        }
517
518        // Extract MI tag from first record for naming
519        let umi: Option<String> = bam_fields::find_string_tag_in_record(&records[0], b"MI")
520            .map(|b| String::from_utf8_lossy(b).to_string());
521
522        // Phase 1: Filter on raw bytes — keep paired, primary, mapped, FR-pair reads
523        let mut paired_indices: Vec<usize> = Vec::new();
524        let mut frag_count = 0usize;
525        for (i, raw) in records.iter().enumerate() {
526            let flg = bam_fields::flags(raw);
527            if flg & flags::PAIRED == 0 {
528                frag_count += 1;
529                continue;
530            }
531            // Filter: primary, mapped, FR pair
532            if flg & (flags::SECONDARY | flags::SUPPLEMENTARY | flags::UNMAPPED) != 0 {
533                continue;
534            }
535            if !bam_fields::is_fr_pair_raw(raw) {
536                continue;
537            }
538            paired_indices.push(i);
539        }
540
541        if frag_count > 0 {
542            self.reject_records_count(frag_count, CallerRejectionReason::FragmentRead);
543        }
544
545        if paired_indices.is_empty() {
546            return Ok(ConsensusOutput::default());
547        }
548
549        // Phase 2: Group by name, pair R1/R2, compute clip amounts
550        let mut by_name: HashMap<&[u8], Vec<usize>> = HashMap::new();
551        let mut name_order: Vec<&[u8]> = Vec::new();
552        for &idx in &paired_indices {
553            let name = bam_fields::read_name(&records[idx]);
554            if !by_name.contains_key(name) {
555                name_order.push(name);
556            }
557            by_name.entry(name).or_default().push(idx);
558        }
559
560        let mut r1_infos: Vec<ClippedRecordInfo> = Vec::new();
561        let mut r2_infos: Vec<ClippedRecordInfo> = Vec::new();
562
563        for name in name_order {
564            let indices = by_name.get(name).expect("name from iteration");
565            if indices.len() != 2 {
566                continue;
567            }
568
569            let (i1, i2) = {
570                let flg0 = bam_fields::flags(&records[indices[0]]);
571                if flg0 & flags::FIRST_SEGMENT != 0 {
572                    (indices[0], indices[1])
573                } else {
574                    (indices[1], indices[0])
575                }
576            };
577
578            // Compute clip amounts from raw bytes
579            let clip_r1 = bam_fields::num_bases_extending_past_mate_raw(&records[i1]);
580            let clip_r2 = bam_fields::num_bases_extending_past_mate_raw(&records[i2]);
581
582            // Build ClippedRecordInfo for R1
583            let r1_info = Self::build_clipped_info(&records[i1], i1, clip_r1);
584            // Build ClippedRecordInfo for R2
585            let r2_info = Self::build_clipped_info(&records[i2], i2, clip_r2);
586
587            r1_infos.push(r1_info);
588            r2_infos.push(r2_info);
589        }
590
591        if r1_infos.is_empty() {
592            return Ok(ConsensusOutput::default());
593        }
594
595        // Check we have enough reads
596        if r1_infos.len() < self.options.min_reads_per_strand {
597            self.reject_records_count(
598                r1_infos.len() + r2_infos.len(),
599                CallerRejectionReason::InsufficientReads,
600            );
601            return Ok(ConsensusOutput::default());
602        }
603
604        // Downsample if needed
605        if let Some(max_reads) = self.options.max_reads_per_strand {
606            if r1_infos.len() > max_reads {
607                let mut indices: Vec<usize> = (0..r1_infos.len()).collect();
608                indices.shuffle(&mut self.rng);
609                indices.truncate(max_reads);
610                indices.sort_unstable();
611
612                let new_r1: Vec<_> = indices
613                    .iter()
614                    .map(|&i| std::mem::replace(&mut r1_infos[i], Self::dummy_info()))
615                    .collect();
616                let new_r2: Vec<_> = indices
617                    .iter()
618                    .map(|&i| std::mem::replace(&mut r2_infos[i], Self::dummy_info()))
619                    .collect();
620                r1_infos = new_r1;
621                r2_infos = new_r2;
622            }
623        }
624
625        // Phase 3: Filter to most common alignment on ClippedRecordInfo
626        let r1_infos = self.filter_to_most_common_alignment_raw(r1_infos);
627        let r2_infos = self.filter_to_most_common_alignment_raw(r2_infos);
628
629        if r1_infos.is_empty() || r2_infos.is_empty() {
630            return Ok(ConsensusOutput::default());
631        }
632
633        if r1_infos.len() < self.options.min_reads_per_strand
634            || r2_infos.len() < self.options.min_reads_per_strand
635        {
636            self.reject_records_count(
637                r1_infos.len() + r2_infos.len(),
638                CallerRejectionReason::InsufficientReads,
639            );
640            return Ok(ConsensusOutput::default());
641        }
642
643        // Phase 4: Overlap/phase calculation on ClippedRecordInfo
644        // Find longest R1 and R2 by reference length (reverse iter for first-max tie-break)
645        let longest_r1 = r1_infos
646            .iter()
647            .rev()
648            .max_by_key(|info| bam_fields::reference_length_from_cigar(&info.clipped_cigar))
649            .expect("non-empty");
650        let longest_r2 = r2_infos
651            .iter()
652            .rev()
653            .max_by_key(|info| bam_fields::reference_length_from_cigar(&info.clipped_cigar))
654            .expect("non-empty");
655
656        let r1_is_negative = longest_r1.flags & flags::REVERSE != 0;
657        let (longest_pos, longest_neg) =
658            if r1_is_negative { (longest_r2, longest_r1) } else { (longest_r1, longest_r2) };
659
660        let neg_start = longest_neg.adjusted_pos;
661        let pos_start = longest_pos.adjusted_pos;
662        let pos_ref_len =
663            bam_fields::reference_length_from_cigar(&longest_pos.clipped_cigar) as usize;
664        let pos_end = pos_start + pos_ref_len.saturating_sub(1);
665
666        let (overlap_start, overlap_end) = (neg_start, pos_end);
667        let duplex_length = overlap_end as i64 - overlap_start as i64 + 1;
668
669        if duplex_length < self.options.min_duplex_length as i64 {
670            self.reject_records_count(
671                r1_infos.len() + r2_infos.len(),
672                CallerRejectionReason::InsufficientOverlap,
673            );
674            return Ok(ConsensusOutput::default());
675        }
676
677        // Phase check using raw CIGAR ops
678        if !Self::check_overlap_phase_raw(longest_r1, longest_r2, overlap_start, overlap_end) {
679            self.reject_records_count(
680                r1_infos.len() + r2_infos.len(),
681                CallerRejectionReason::IndelErrorBetweenStrands,
682            );
683            return Ok(ConsensusOutput::default());
684        }
685
686        let r2_is_negative = longest_r2.flags & flags::REVERSE != 0;
687
688        // Compute consensus length using clipped info
689        let consensus_length =
690            Self::compute_consensus_length_raw(longest_pos, longest_neg, overlap_end);
691        let Some(consensus_length) = consensus_length else {
692            self.reject_records_count(
693                r1_infos.len() + r2_infos.len(),
694                CallerRejectionReason::IndelErrorBetweenStrands,
695            );
696            return Ok(ConsensusOutput::default());
697        };
698
699        // Phase 5: Build SourceReads and call single-strand consensus
700        let r1_is_neg_strand = r1_infos.first().is_some_and(|i| i.flags & flags::REVERSE != 0);
701        let r1_source_reads: Vec<SourceRead> = r1_infos
702            .iter()
703            .enumerate()
704            .map(|(idx, info)| {
705                Self::to_source_read_for_codec_raw(
706                    &records[info.raw_idx],
707                    idx,
708                    info.clip_amount,
709                    info.clip_from_start,
710                    Some(&info.clipped_cigar),
711                )
712            })
713            .collect();
714
715        let umi_str = umi.as_deref().unwrap_or("");
716        let ss_r1 = match self.ss_caller.consensus_call(umi_str, r1_source_reads)? {
717            Some(vcr) => Self::vanilla_to_single_strand(vcr, r1_is_neg_strand, r1_infos.len()),
718            None => return Ok(ConsensusOutput::default()),
719        };
720
721        let r2_is_neg_strand = r2_infos.first().is_some_and(|i| i.flags & flags::REVERSE != 0);
722        let r2_source_reads: Vec<SourceRead> = r2_infos
723            .iter()
724            .enumerate()
725            .map(|(idx, info)| {
726                Self::to_source_read_for_codec_raw(
727                    &records[info.raw_idx],
728                    idx,
729                    info.clip_amount,
730                    info.clip_from_start,
731                    Some(&info.clipped_cigar),
732                )
733            })
734            .collect();
735
736        let ss_r2 = match self.ss_caller.consensus_call(umi_str, r2_source_reads)? {
737            Some(vcr) => Self::vanilla_to_single_strand(vcr, r2_is_neg_strand, r2_infos.len()),
738            None => return Ok(ConsensusOutput::default()),
739        };
740
741        if consensus_length < ss_r1.bases.len() || consensus_length < ss_r2.bases.len() {
742            self.reject_records_count(
743                r1_infos.len() + r2_infos.len(),
744                CallerRejectionReason::IndelErrorBetweenStrands,
745            );
746            return Ok(ConsensusOutput::default());
747        }
748
749        // Orient and pad
750        let (ss_r1_oriented, ss_r2_oriented) = if r1_is_negative {
751            (Self::reverse_complement_ss(&ss_r1), ss_r2.clone())
752        } else {
753            (ss_r1.clone(), Self::reverse_complement_ss(&ss_r2))
754        };
755
756        let padded_r1 = Self::pad_consensus(&ss_r1_oriented, consensus_length, r1_is_negative);
757        let padded_r2 = Self::pad_consensus(&ss_r2_oriented, consensus_length, r2_is_negative);
758
759        let consensus = self.build_duplex_consensus_from_padded(&padded_r1, &padded_r2)?;
760        let consensus = self.mask_consensus_quals_query_based(consensus, &padded_r1, &padded_r2);
761        let consensus =
762            if r1_is_negative { Self::reverse_complement_ss(&consensus) } else { consensus };
763
764        let (ss_for_ac, ss_for_bc) = if r1_is_negative {
765            (Self::reverse_complement_ss(&padded_r1), Self::reverse_complement_ss(&padded_r2))
766        } else {
767            (padded_r1.clone(), padded_r2.clone())
768        };
769
770        // Phase 6: Build output from raw bytes
771        // Collect raw record refs from filtered reads only (r1_infos/r2_infos)
772        // for tags derived from consensus source reads (cell barcode, etc.)
773        let all_paired_raws: Vec<&[u8]> = r1_infos
774            .iter()
775            .chain(r2_infos.iter())
776            .map(|info| records[info.raw_idx].as_slice())
777            .collect();
778
779        let mut output = ConsensusOutput::default();
780        self.build_output_record_into(
781            &mut output,
782            &consensus,
783            &ss_for_ac,
784            &ss_for_bc,
785            umi.as_deref(),
786            &all_paired_raws,
787            records,
788        )?;
789
790        self.stats.consensus_reads_generated += 1;
791        Ok(output)
792    }
793
794    /// Build `ClippedRecordInfo` for a single raw record.
795    fn build_clipped_info(raw: &[u8], raw_idx: usize, clip_amount: usize) -> ClippedRecordInfo {
796        let flg = bam_fields::flags(raw);
797        let is_reverse = flg & flags::REVERSE != 0;
798        let clip_from_start = is_reverse; // negative strand ⟹ clip from start
799
800        let original_ops = bam_fields::get_cigar_ops(raw);
801        let (clipped_cigar, ref_bases_consumed) =
802            bam_fields::clip_cigar_ops_raw(&original_ops, clip_amount, clip_from_start);
803
804        let original_seq_len = bam_fields::l_seq(raw) as usize;
805        let clipped_seq_len = original_seq_len.saturating_sub(clip_amount);
806
807        // 1-based alignment start, adjusted for start-clipping
808        let pos_0based = bam_fields::pos(raw);
809        debug_assert!(
810            pos_0based >= 0,
811            "build_clipped_info called on unmapped record (pos={pos_0based})"
812        );
813        let adjusted_pos = if clip_from_start {
814            (pos_0based + 1) as usize + ref_bases_consumed
815        } else {
816            (pos_0based + 1) as usize
817        };
818
819        ClippedRecordInfo {
820            raw_idx,
821            clip_amount,
822            clip_from_start,
823            clipped_seq_len,
824            clipped_cigar,
825            adjusted_pos,
826            flags: flg,
827        }
828    }
829
830    /// Dummy `ClippedRecordInfo` used as a swap placeholder during downsample.
831    fn dummy_info() -> ClippedRecordInfo {
832        ClippedRecordInfo {
833            raw_idx: 0,
834            clip_amount: 0,
835            clip_from_start: false,
836            clipped_seq_len: 0,
837            clipped_cigar: Vec::new(),
838            adjusted_pos: 0,
839            flags: 0,
840        }
841    }
842
843    /// Filter `ClippedRecordInfo`s to the most common alignment pattern.
844    fn filter_to_most_common_alignment_raw(
845        &mut self,
846        infos: Vec<ClippedRecordInfo>,
847    ) -> Vec<ClippedRecordInfo> {
848        if infos.len() < 2 {
849            return infos;
850        }
851
852        let mut indexed: Vec<IndexedSourceRead> = infos
853            .iter()
854            .enumerate()
855            .map(|(i, info)| {
856                let mut cigar = bam_fields::simplify_cigar_from_raw(&info.clipped_cigar);
857                if info.flags & flags::REVERSE != 0 {
858                    cigar.reverse();
859                }
860                (i, info.clipped_seq_len, cigar)
861            })
862            .collect();
863
864        indexed.sort_by(|a, b| b.1.cmp(&a.1));
865
866        let best_indices = select_most_common_alignment_group(&indexed);
867
868        let rejected_count = infos.len() - best_indices.len();
869        if rejected_count > 0 {
870            *self
871                .stats
872                .rejection_reasons
873                .entry(CallerRejectionReason::MinorityAlignment)
874                .or_insert(0) += rejected_count;
875            self.stats.reads_filtered += rejected_count as u64;
876        }
877
878        let best_set: std::collections::HashSet<usize> = best_indices.into_iter().collect();
879        infos
880            .into_iter()
881            .enumerate()
882            .filter(|(i, _)| best_set.contains(i))
883            .map(|(_, info)| info)
884            .collect()
885    }
886
887    /// Phase check using `ClippedRecordInfo`.
888    fn check_overlap_phase_raw(
889        r1: &ClippedRecordInfo,
890        r2: &ClippedRecordInfo,
891        overlap_start: usize,
892        overlap_end: usize,
893    ) -> bool {
894        let r1s = bam_fields::read_pos_at_ref_pos_raw(
895            &r1.clipped_cigar,
896            r1.adjusted_pos,
897            overlap_start,
898            true,
899        );
900        let r2s = bam_fields::read_pos_at_ref_pos_raw(
901            &r2.clipped_cigar,
902            r2.adjusted_pos,
903            overlap_start,
904            true,
905        );
906        let r1e = bam_fields::read_pos_at_ref_pos_raw(
907            &r1.clipped_cigar,
908            r1.adjusted_pos,
909            overlap_end,
910            true,
911        );
912        let r2e = bam_fields::read_pos_at_ref_pos_raw(
913            &r2.clipped_cigar,
914            r2.adjusted_pos,
915            overlap_end,
916            true,
917        );
918
919        match (r1s, r2s, r1e, r2e) {
920            (Some(a), Some(b), Some(c), Some(d)) => (a as i64 - b as i64) == (c as i64 - d as i64),
921            _ => false,
922        }
923    }
924
925    /// Compute consensus length from `ClippedRecordInfo`.
926    fn compute_consensus_length_raw(
927        pos_info: &ClippedRecordInfo,
928        neg_info: &ClippedRecordInfo,
929        overlap_end: usize,
930    ) -> Option<usize> {
931        let pos_read_pos = bam_fields::read_pos_at_ref_pos_raw(
932            &pos_info.clipped_cigar,
933            pos_info.adjusted_pos,
934            overlap_end,
935            false,
936        )?;
937        let neg_read_pos = bam_fields::read_pos_at_ref_pos_raw(
938            &neg_info.clipped_cigar,
939            neg_info.adjusted_pos,
940            overlap_end,
941            false,
942        )?;
943
944        Some(pos_read_pos + neg_info.clipped_seq_len - neg_read_pos)
945    }
946
947    /// Pads a single-strand consensus to a new length.
948    /// If `pad_left` is true, pads on the left (prepends), otherwise on the right (appends).
949    /// This matches fgbio's `VanillaConsensusRead.padded` method.
950    ///
951    /// # Lowercase 'n' for padding
952    ///
953    /// Uses lowercase 'n' (`NO_CALL_BASE_LOWER`) for padding bases to match fgbio behavior.
954    /// These padded bases may be reverse complemented later (see `reverse_complement_ss`),
955    /// and `dna::complement_base` preserves the lowercase 'n' through that operation.
956    /// This is specific to CODEC - simplex/duplex use uppercase 'N' for no-call bases.
957    fn pad_consensus(
958        ss: &SingleStrandConsensus,
959        new_length: usize,
960        pad_left: bool,
961    ) -> SingleStrandConsensus {
962        let current_len = ss.bases.len();
963        if new_length <= current_len {
964            return ss.clone();
965        }
966
967        let pad_len = new_length - current_len;
968        let pad_bases = vec![NO_CALL_BASE_LOWER; pad_len]; // Use lowercase 'n' to match fgbio
969        let pad_quals = vec![0u8; pad_len];
970        let pad_depths = vec![0u16; pad_len];
971        let pad_errors = vec![0u16; pad_len];
972
973        let (bases, quals, depths, errors) = if pad_left {
974            (
975                [pad_bases, ss.bases.clone()].concat(),
976                [pad_quals, ss.quals.clone()].concat(),
977                [pad_depths, ss.depths.clone()].concat(),
978                [pad_errors, ss.errors.clone()].concat(),
979            )
980        } else {
981            (
982                [ss.bases.clone(), pad_bases].concat(),
983                [ss.quals.clone(), pad_quals].concat(),
984                [ss.depths.clone(), pad_depths].concat(),
985                [ss.errors.clone(), pad_errors].concat(),
986            )
987        };
988
989        SingleStrandConsensus {
990            bases,
991            quals,
992            depths,
993            errors,
994            raw_read_count: ss.raw_read_count,
995            ref_start: ss.ref_start,
996            ref_end: ss.ref_end,
997            is_negative_strand: ss.is_negative_strand,
998        }
999    }
1000
1001    /// Builds duplex consensus from two query-based single-strand consensuses.
1002    /// The two SS consensuses should already be padded to the same length.
1003    ///
1004    /// Single-strand positions (where only one strand has data) preserve their
1005    /// original qualities from the single-strand consensus, matching fgbio's behavior.
1006    fn build_duplex_consensus_from_padded(
1007        &mut self,
1008        ss_a: &SingleStrandConsensus,
1009        ss_b: &SingleStrandConsensus,
1010    ) -> Result<SingleStrandConsensus> {
1011        let len = ss_a.bases.len();
1012        assert_eq!(ss_b.bases.len(), len, "Padded consensuses must be the same length");
1013
1014        let mut bases = vec![NO_CALL_BASE; len];
1015        let mut quals: Vec<PhredScore> = vec![MIN_PHRED; len];
1016        let mut depths = vec![0u16; len];
1017        let mut errors = vec![0u16; len];
1018
1019        let mut duplex_disagreements = 0usize;
1020        let mut duplex_bases_count = 0usize;
1021
1022        for pos_idx in 0..len {
1023            let ba = ss_a.bases[pos_idx];
1024            let qa = ss_a.quals[pos_idx];
1025            let da = ss_a.depths[pos_idx];
1026            let ea = ss_a.errors[pos_idx];
1027
1028            let bb = ss_b.bases[pos_idx];
1029            let qb = ss_b.quals[pos_idx];
1030            let db = ss_b.depths[pos_idx];
1031            let eb = ss_b.errors[pos_idx];
1032
1033            // Check if this is a duplex position (both strands have data)
1034            // Match fgbio's logic: only check if base is N/n (padding), not quality
1035            // fgbio doesn't check qual > 0 here - it uses whatever qualities are present
1036            let a_has_data = ba != NO_CALL_BASE && ba != NO_CALL_BASE_LOWER;
1037            let b_has_data = bb != NO_CALL_BASE && bb != NO_CALL_BASE_LOWER;
1038
1039            let (duplex_base, duplex_qual, depth, error) = match (a_has_data, b_has_data) {
1040                (true, true) => {
1041                    // Both strands have a base at this position - duplex region
1042                    duplex_bases_count += 1;
1043                    let (raw_base, raw_qual) = if ba == bb {
1044                        // Agreement - sum qualities, cap to 93
1045                        (ba, min(93, u16::from(qa) + u16::from(qb)) as u8)
1046                    } else if qa > qb {
1047                        // Disagreement - take higher quality base, subtract qualities
1048                        duplex_disagreements += 1;
1049                        // Cap to minimum 2 (matching fgbio's PhredScore.cap)
1050                        (ba, max(MIN_PHRED, qa.saturating_sub(qb)))
1051                    } else if qb > qa {
1052                        duplex_disagreements += 1;
1053                        // Cap to minimum 2 (matching fgbio's PhredScore.cap)
1054                        (bb, max(MIN_PHRED, qb.saturating_sub(qa)))
1055                    } else {
1056                        // Equal quality disagreement
1057                        // IMPORTANT: fgbio sets rawBase = aBase (not N!) even though the final
1058                        // base will be masked to N due to low quality. The error calculation
1059                        // uses rawBase (aBase), not the final masked base.
1060                        duplex_disagreements += 1;
1061                        (ba, MIN_PHRED) // Use aBase (ba), not N!
1062                    };
1063
1064                    // Mask to N if quality is MinValue (2), matching fgbio's logic
1065                    let (final_base, final_qual) = if raw_qual == MIN_PHRED {
1066                        (NO_CALL_BASE, MIN_PHRED)
1067                    } else {
1068                        (raw_base, raw_qual)
1069                    };
1070
1071                    // Calculate errors matching fgbio's logic:
1072                    // - When bases agree: sum of both errors
1073                    // - When bases disagree: count errors from the chosen strand + non-errors from other strand
1074                    //   (because the non-errors from the other strand now disagree with the consensus)
1075                    let duplex_error = if ba == bb {
1076                        // Agreement: sum errors from both strands
1077                        ea + eb
1078                    } else if ba == raw_base {
1079                        // We chose A's base (including equal quality disagreements where fgbio uses aBase)
1080                        // Count A's errors + B's non-errors (which now disagree with consensus)
1081                        ea + db.saturating_sub(eb)
1082                    } else {
1083                        // We chose B's base: count B's errors + A's non-errors
1084                        eb + da.saturating_sub(ea)
1085                    };
1086
1087                    (final_base, final_qual, da + db, duplex_error)
1088                }
1089                (true, false) => {
1090                    // Only A has data - single-strand from A
1091                    // Mask to N if quality is MinValue (2), matching fgbio's behavior
1092                    if qa == MIN_PHRED {
1093                        (NO_CALL_BASE, MIN_PHRED, da, ea)
1094                    } else {
1095                        (ba, qa, da, ea)
1096                    }
1097                }
1098                (false, true) => {
1099                    // Only B has data - single-strand from B
1100                    // Mask to N if quality is MinValue (2), matching fgbio's behavior
1101                    if qb == MIN_PHRED {
1102                        (NO_CALL_BASE, MIN_PHRED, db, eb)
1103                    } else {
1104                        (bb, qb, db, eb)
1105                    }
1106                }
1107                (false, false) => {
1108                    // Neither has data - N
1109                    // fgbio still calculates errors for these positions:
1110                    // since both bases are the same (N == N), errors = a.errors + b.errors
1111                    let duplex_error = ea + eb;
1112                    (NO_CALL_BASE, MIN_PHRED, 0, duplex_error)
1113                }
1114            };
1115
1116            // Apply fgbio's NoCall masking: if EITHER original strand base is NoCall (uppercase 'N'),
1117            // mask the final result to N. This is applied AFTER the rawBase calculation.
1118            // Note: fgbio only considers uppercase 'N' as NoCall, not lowercase 'n' (which is used for padding).
1119            let (duplex_base, duplex_qual) = if ba == NO_CALL_BASE || bb == NO_CALL_BASE {
1120                (NO_CALL_BASE, MIN_PHRED)
1121            } else {
1122                (duplex_base, duplex_qual)
1123            };
1124
1125            bases[pos_idx] = duplex_base;
1126            quals[pos_idx] = duplex_qual;
1127            depths[pos_idx] = depth;
1128            errors[pos_idx] = error;
1129        }
1130
1131        // Check duplex disagreement rate
1132        if duplex_bases_count > 0 {
1133            let duplex_error_rate = duplex_disagreements as f64 / duplex_bases_count as f64;
1134            self.stats.consensus_duplex_bases_emitted += duplex_bases_count as u64;
1135            self.stats.duplex_disagreement_base_count += duplex_disagreements as u64;
1136
1137            if duplex_disagreements > self.options.max_duplex_disagreements {
1138                anyhow::bail!("High duplex disagreement: {duplex_disagreements} disagreements");
1139            }
1140            if duplex_error_rate > self.options.max_duplex_disagreement_rate {
1141                anyhow::bail!("High duplex disagreement rate: {duplex_error_rate:.4}");
1142            }
1143        }
1144
1145        Ok(SingleStrandConsensus {
1146            bases,
1147            quals,
1148            depths,
1149            errors,
1150            raw_read_count: ss_a.raw_read_count + ss_b.raw_read_count,
1151            ref_start: 0,
1152            ref_end: len.saturating_sub(1),
1153            is_negative_strand: ss_a.is_negative_strand,
1154        })
1155    }
1156
1157    /// Masks consensus qualities for query-based consensus.
1158    /// Identifies single-strand regions by checking if either padded consensus has 'N'.
1159    /// This matches fgbio's maskCodecConsensusQuals behavior.
1160    fn mask_consensus_quals_query_based(
1161        &self,
1162        mut consensus: SingleStrandConsensus,
1163        padded_r1: &SingleStrandConsensus,
1164        padded_r2: &SingleStrandConsensus,
1165    ) -> SingleStrandConsensus {
1166        let len = consensus.bases.len();
1167
1168        for idx in 0..len {
1169            // Mask single-strand regions first
1170            // Single-strand positions are where one padded consensus has 'N'
1171            // This matches fgbio's check: if (a(i) == 'N' || b(i) == 'N')
1172            let a_is_n = padded_r1.bases.get(idx).copied().unwrap_or(NO_CALL_BASE) == NO_CALL_BASE;
1173            let b_is_n = padded_r2.bases.get(idx).copied().unwrap_or(NO_CALL_BASE) == NO_CALL_BASE;
1174
1175            if (a_is_n || b_is_n) && consensus.bases[idx] != NO_CALL_BASE {
1176                // This is a single-strand position with a base
1177                if let Some(ss_qual) = self.options.single_strand_qual {
1178                    consensus.quals[idx] = ss_qual;
1179                }
1180            }
1181
1182            // Mask outer bases
1183            if let Some(outer_qual) = self.options.outer_bases_qual {
1184                let outer_len = self.options.outer_bases_length;
1185                if idx < outer_len || idx >= len.saturating_sub(outer_len) {
1186                    consensus.quals[idx] = min(consensus.quals[idx], outer_qual);
1187                }
1188            }
1189        }
1190
1191        consensus
1192    }
1193
1194    /// Builds the output record from the consensus and writes raw bytes into `output`.
1195    #[expect(
1196        clippy::unnecessary_wraps,
1197        reason = "Result return type kept for API consistency with other callers"
1198    )]
1199    #[expect(
1200        clippy::too_many_arguments,
1201        reason = "all_records needed separately from source_raws for UMI consensus"
1202    )]
1203    fn build_output_record_into(
1204        &mut self,
1205        output: &mut ConsensusOutput,
1206        consensus: &SingleStrandConsensus,
1207        ss_a: &SingleStrandConsensus,
1208        ss_b: &SingleStrandConsensus,
1209        umi: Option<&str>,
1210        source_raws: &[&[u8]],
1211        all_records: &[Vec<u8>],
1212    ) -> Result<()> {
1213        // Generate read name - use ':' delimiter to match fgbio format
1214        self.consensus_counter += 1;
1215        let read_name = if let Some(umi_str) = umi {
1216            format!("{}:{}", self.read_name_prefix, umi_str)
1217        } else {
1218            format!("{}:{}", self.read_name_prefix, self.consensus_counter)
1219        };
1220
1221        // Codec always outputs unmapped fragments
1222        let flag = flags::UNMAPPED;
1223
1224        self.bam_builder.build_record(
1225            read_name.as_bytes(),
1226            flag,
1227            &consensus.bases,
1228            &consensus.quals,
1229        );
1230
1231        // RG tag
1232        self.bam_builder.append_string_tag(b"RG", self.read_group_id.as_bytes());
1233
1234        // MI tag
1235        if let Some(umi_str) = umi {
1236            self.bam_builder.append_string_tag(b"MI", umi_str.as_bytes());
1237        }
1238
1239        // Duplex consensus tags (cD, cM, cE)
1240        // fgbio calculates these from totalDepths = ab.depths + ba.depths (sum of SS depths),
1241        // NOT from the duplex consensus depths
1242        let total_depths: Vec<i32> = ss_a
1243            .depths
1244            .iter()
1245            .zip(ss_b.depths.iter())
1246            .map(|(&a, &b)| i32::from(a) + i32::from(b))
1247            .collect();
1248        let total_depth = total_depths.iter().copied().max().unwrap_or(0);
1249        let min_depth = total_depths.iter().copied().min().unwrap_or(0);
1250        let total_errors: usize = consensus.errors.iter().map(|&e| e as usize).sum();
1251        let total_bases: i32 = total_depths.iter().sum();
1252        let error_rate =
1253            if total_bases > 0 { total_errors as f32 / total_bases as f32 } else { 0.0 };
1254
1255        self.bam_builder.append_int_tag(b"cD", total_depth);
1256        self.bam_builder.append_int_tag(b"cM", min_depth);
1257        self.bam_builder.append_float_tag(b"cE", error_rate);
1258
1259        // AB strand tags (aD, aM, aE)
1260        let a_max_depth = ss_a.depths.iter().map(|&d| i32::from(d)).max().unwrap_or(0);
1261        let a_min_depth = ss_a.depths.iter().map(|&d| i32::from(d)).min().unwrap_or(0);
1262        let a_total_errors: usize = ss_a.errors.iter().map(|&e| e as usize).sum();
1263        let a_total_bases: usize = ss_a.depths.iter().map(|&d| d as usize).sum();
1264        let a_error_rate =
1265            if a_total_bases > 0 { a_total_errors as f32 / a_total_bases as f32 } else { 0.0 };
1266
1267        self.bam_builder.append_int_tag(b"aD", a_max_depth);
1268        self.bam_builder.append_int_tag(b"aM", a_min_depth);
1269        self.bam_builder.append_float_tag(b"aE", a_error_rate);
1270
1271        // BA strand tags (bD, bM, bE)
1272        let b_max_depth = ss_b.depths.iter().map(|&d| i32::from(d)).max().unwrap_or(0);
1273        let b_min_depth = ss_b.depths.iter().map(|&d| i32::from(d)).min().unwrap_or(0);
1274        let b_total_errors: usize = ss_b.errors.iter().map(|&e| e as usize).sum();
1275        let b_total_bases: usize = ss_b.depths.iter().map(|&d| d as usize).sum();
1276        let b_error_rate =
1277            if b_total_bases > 0 { b_total_errors as f32 / b_total_bases as f32 } else { 0.0 };
1278
1279        self.bam_builder.append_int_tag(b"bD", b_max_depth);
1280        self.bam_builder.append_int_tag(b"bM", b_min_depth);
1281        self.bam_builder.append_float_tag(b"bE", b_error_rate);
1282
1283        // Per-base tags if enabled
1284        if self.options.produce_per_base_tags {
1285            // Per-base depth arrays - convert to signed integers to match fgbio/simplex/duplex
1286            let ad_array: Vec<i16> = ss_a.depths.iter().map(|&d| d as i16).collect();
1287            let bd_array: Vec<i16> = ss_b.depths.iter().map(|&d| d as i16).collect();
1288            self.bam_builder.append_i16_array_tag(b"ad", &ad_array);
1289            self.bam_builder.append_i16_array_tag(b"bd", &bd_array);
1290
1291            // Per-base error arrays - convert to signed integers to match fgbio/simplex/duplex
1292            let ae_array: Vec<i16> = ss_a.errors.iter().map(|&e| e as i16).collect();
1293            let be_array: Vec<i16> = ss_b.errors.iter().map(|&e| e as i16).collect();
1294            self.bam_builder.append_i16_array_tag(b"ae", &ae_array);
1295            self.bam_builder.append_i16_array_tag(b"be", &be_array);
1296
1297            // Single-strand consensus sequences
1298            self.bam_builder.append_string_tag(b"ac", &ss_a.bases);
1299            self.bam_builder.append_string_tag(b"bc", &ss_b.bases);
1300
1301            // Single-strand consensus qualities - Phred+33 encoded
1302            self.bam_builder.append_phred33_string_tag(b"aq", &ss_a.quals);
1303            self.bam_builder.append_phred33_string_tag(b"bq", &ss_b.quals);
1304        }
1305
1306        // Cell barcode tag - extract from first source read if configured
1307        if let Some(cell_tag) = &self.options.cell_tag {
1308            let cell_tag_bytes: [u8; 2] = [cell_tag.as_ref()[0], cell_tag.as_ref()[1]];
1309            for raw in source_raws {
1310                if let Some(cell_bc) = bam_fields::find_string_tag_in_record(raw, &cell_tag_bytes) {
1311                    if !cell_bc.is_empty() {
1312                        self.bam_builder.append_string_tag(&cell_tag_bytes, cell_bc);
1313                        break;
1314                    }
1315                }
1316            }
1317        }
1318
1319        // RX tag (UmiBases) - extract from ALL records in the MI group and build consensus.
1320        // This must use all_records (not just filtered source_raws) to match fgbio, which
1321        // includes reads excluded from sequence consensus (unmapped, non-FR, etc.) when
1322        // building the UMI consensus. Their UMI bases help resolve N's at ambiguous positions.
1323        let umis: Vec<String> = all_records
1324            .iter()
1325            .filter_map(|raw| {
1326                bam_fields::find_string_tag_in_record(raw, b"RX")
1327                    .and_then(|b| String::from_utf8(b.to_vec()).ok())
1328            })
1329            .collect();
1330
1331        if !umis.is_empty() {
1332            let consensus_umi = consensus_umis(&umis);
1333            if !consensus_umi.is_empty() {
1334                self.bam_builder.append_string_tag(b"RX", consensus_umi.as_bytes());
1335            }
1336        }
1337
1338        // Write the completed record with block_size prefix
1339        self.bam_builder.write_with_block_size(&mut output.data);
1340        output.count += 1;
1341
1342        Ok(())
1343    }
1344
1345    /// Rejects a count of records with the given reason (without storing the actual records)
1346    fn reject_records_count(&mut self, count: usize, reason: CallerRejectionReason) {
1347        *self.stats.rejection_reasons.entry(reason).or_insert(0) += count;
1348        self.stats.reads_filtered += count as u64;
1349    }
1350}
1351
1352/// Implementation of the `ConsensusCaller` trait for CODEC consensus calling.
1353///
1354/// This allows `CodecConsensusCaller` to be used polymorphically with other
1355/// consensus callers (e.g., `VanillaUmiConsensusCaller`, `DuplexConsensusCaller`).
1356impl ConsensusCaller for CodecConsensusCaller {
1357    fn consensus_reads(&mut self, records: Vec<Vec<u8>>) -> Result<ConsensusOutput> {
1358        let result = self.consensus_reads_raw(&records)?;
1359        // When a group fails to produce consensus, all its records are rejected
1360        if self.track_rejects && result.count == 0 && !records.is_empty() {
1361            self.rejected_reads.extend(records);
1362        }
1363        Ok(result)
1364    }
1365
1366    #[expect(
1367        clippy::cast_possible_truncation,
1368        reason = "read counts will not exceed usize on any supported platform"
1369    )]
1370    fn total_reads(&self) -> usize {
1371        self.stats.total_input_reads as usize
1372    }
1373
1374    #[expect(
1375        clippy::cast_possible_truncation,
1376        reason = "read counts will not exceed usize on any supported platform"
1377    )]
1378    fn total_filtered(&self) -> usize {
1379        self.stats.reads_filtered as usize
1380    }
1381
1382    #[expect(
1383        clippy::cast_possible_truncation,
1384        reason = "read counts will not exceed usize on any supported platform"
1385    )]
1386    fn consensus_reads_constructed(&self) -> usize {
1387        self.stats.consensus_reads_generated as usize
1388    }
1389
1390    #[expect(
1391        clippy::cast_possible_truncation,
1392        reason = "read counts will not exceed usize on any supported platform"
1393    )]
1394    fn statistics(&self) -> ConsensusCallingStats {
1395        ConsensusCallingStats {
1396            total_reads: self.stats.total_input_reads as usize,
1397            consensus_reads: self.stats.consensus_reads_generated as usize,
1398            filtered_reads: self.stats.reads_filtered as usize,
1399            rejection_reasons: self.stats.rejection_reasons.clone(),
1400        }
1401    }
1402
1403    fn log_statistics(&self) {
1404        let stats = &self.stats;
1405        log::info!("CODEC Consensus Calling Statistics:");
1406        log::info!("  Total input reads: {}", stats.total_input_reads);
1407        log::info!("  Consensus reads generated: {}", stats.consensus_reads_generated);
1408        log::info!("  Reads filtered: {}", stats.reads_filtered);
1409        log::info!("  Consensus bases emitted: {}", stats.consensus_bases_emitted);
1410        log::info!("  Duplex bases emitted: {}", stats.consensus_duplex_bases_emitted);
1411        log::info!("  Duplex disagreement rate: {:.4}%", stats.duplex_disagreement_rate() * 100.0);
1412        if !stats.rejection_reasons.is_empty() {
1413            log::info!("  Rejection reasons:");
1414            for (reason, count) in &stats.rejection_reasons {
1415                log::info!("    {reason:?}: {count}");
1416            }
1417        }
1418    }
1419}
1420
1421/// Bridge methods for encoding `RecordBuf` to raw bytes.
1422/// Used by tests (both unit and integration) to call the raw-byte pipeline.
1423impl CodecConsensusCaller {
1424    /// Encode a single `RecordBuf` to raw BAM bytes.
1425    fn record_buf_to_raw(rec: &noodles::sam::alignment::RecordBuf) -> Vec<u8> {
1426        use crate::vendored::bam_codec::encode_record_buf;
1427        use noodles::sam::header::record::value::Map;
1428        use noodles::sam::header::record::value::map::ReferenceSequence;
1429        use std::num::NonZeroUsize;
1430
1431        let mut header = noodles::sam::Header::default();
1432        // Add reference sequences so records with reference_sequence_id encode correctly
1433        for name in &["chr1", "chr2", "chr3"] {
1434            let rs = Map::<ReferenceSequence>::new(NonZeroUsize::new(1000).unwrap());
1435            header.reference_sequences_mut().insert(bstr::BString::from(*name), rs);
1436        }
1437
1438        let mut buf = Vec::new();
1439        encode_record_buf(&mut buf, &header, rec).expect("encode_record_buf failed");
1440        buf
1441    }
1442
1443    /// Encode `RecordBuf`s to raw bytes and delegate to `consensus_reads`.
1444    ///
1445    /// # Errors
1446    ///
1447    /// Returns an error if consensus calling fails on the provided records.
1448    #[expect(
1449        clippy::needless_pass_by_value,
1450        reason = "matches ConsensusCaller trait signature pattern"
1451    )]
1452    pub fn consensus_reads_from_sam_records(
1453        &mut self,
1454        recs: Vec<noodles::sam::alignment::RecordBuf>,
1455    ) -> Result<ConsensusOutput> {
1456        let raw_records: Vec<Vec<u8>> = recs.iter().map(Self::record_buf_to_raw).collect();
1457        self.consensus_reads(raw_records)
1458    }
1459}
1460
1461#[cfg(test)]
1462#[expect(
1463    clippy::must_use_candidate,
1464    clippy::needless_pass_by_value,
1465    clippy::cast_sign_loss,
1466    reason = "test-only methods use owned values and numeric casts"
1467)]
1468impl CodecConsensusCaller {
1469    /// Test-only wrapper: check if a `RecordBuf` is part of an FR pair.
1470    pub fn is_fr_pair(&self, rec: &noodles::sam::alignment::RecordBuf) -> bool {
1471        let raw = Self::record_buf_to_raw(rec);
1472        bam_fields::is_fr_pair_raw(&raw)
1473    }
1474
1475    /// Test-only wrapper: filter `RecordBuf`s to most common alignment.
1476    pub fn filter_to_most_common_alignment(
1477        &mut self,
1478        recs: Vec<noodles::sam::alignment::RecordBuf>,
1479    ) -> Vec<noodles::sam::alignment::RecordBuf> {
1480        let raws: Vec<Vec<u8>> = recs.iter().map(Self::record_buf_to_raw).collect();
1481        let infos: Vec<ClippedRecordInfo> =
1482            raws.iter().enumerate().map(|(i, raw)| Self::build_clipped_info(raw, i, 0)).collect();
1483        let filtered = self.filter_to_most_common_alignment_raw(infos);
1484        filtered.into_iter().map(|info| recs[info.raw_idx].clone()).collect()
1485    }
1486
1487    /// Test-only wrapper: `read_pos_at_ref_pos` on a `RecordBuf`.
1488    pub fn read_pos_at_ref_pos(
1489        rec: &noodles::sam::alignment::RecordBuf,
1490        ref_pos: usize,
1491        return_last_base_if_deleted: bool,
1492    ) -> Option<usize> {
1493        let raw = Self::record_buf_to_raw(rec);
1494        let cigar_ops = bam_fields::get_cigar_ops(&raw);
1495        let alignment_start = (bam_fields::pos(&raw) + 1) as usize; // 1-based
1496        bam_fields::read_pos_at_ref_pos_raw(
1497            &cigar_ops,
1498            alignment_start,
1499            ref_pos,
1500            return_last_base_if_deleted,
1501        )
1502    }
1503
1504    /// Test-only wrapper: `check_overlap_phase` on `RecordBuf`s.
1505    pub fn check_overlap_phase(
1506        &self,
1507        r1: &noodles::sam::alignment::RecordBuf,
1508        r2: &noodles::sam::alignment::RecordBuf,
1509        overlap_start: usize,
1510        overlap_end: usize,
1511    ) -> bool {
1512        let raw1 = Self::record_buf_to_raw(r1);
1513        let raw2 = Self::record_buf_to_raw(r2);
1514        let info1 = Self::build_clipped_info(&raw1, 0, 0);
1515        let info2 = Self::build_clipped_info(&raw2, 1, 0);
1516        Self::check_overlap_phase_raw(&info1, &info2, overlap_start, overlap_end)
1517    }
1518
1519    /// Test-only wrapper: `to_source_read_for_codec` on a `RecordBuf`.
1520    pub(crate) fn to_source_read_for_codec(
1521        rec: &noodles::sam::alignment::RecordBuf,
1522        original_idx: usize,
1523    ) -> SourceRead {
1524        let raw = Self::record_buf_to_raw(rec);
1525        Self::to_source_read_for_codec_raw(&raw, original_idx, 0, false, None)
1526    }
1527
1528    /// Test-only wrapper: `compute_codec_consensus_length` on two `RecordBuf`s.
1529    pub fn compute_codec_consensus_length(
1530        pos_rec: &noodles::sam::alignment::RecordBuf,
1531        neg_rec: &noodles::sam::alignment::RecordBuf,
1532        overlap_end: usize,
1533    ) -> Option<usize> {
1534        let raw_pos = Self::record_buf_to_raw(pos_rec);
1535        let raw_neg = Self::record_buf_to_raw(neg_rec);
1536        let info_pos = Self::build_clipped_info(&raw_pos, 0, 0);
1537        let info_neg = Self::build_clipped_info(&raw_neg, 1, 0);
1538        Self::compute_consensus_length_raw(&info_pos, &info_neg, overlap_end)
1539    }
1540
1541    /// Test-only wrapper: downsample pairs using `max_reads_per_strand` option.
1542    pub fn downsample_pairs(
1543        &mut self,
1544        r1s: Vec<noodles::sam::alignment::RecordBuf>,
1545        r2s: Vec<noodles::sam::alignment::RecordBuf>,
1546    ) -> (Vec<noodles::sam::alignment::RecordBuf>, Vec<noodles::sam::alignment::RecordBuf>) {
1547        let Some(max_reads) = self.options.max_reads_per_strand else {
1548            return (r1s, r2s);
1549        };
1550        if r1s.len() <= max_reads {
1551            return (r1s, r2s);
1552        }
1553        let mut indices: Vec<usize> = (0..r1s.len()).collect();
1554        indices.shuffle(&mut self.rng);
1555        indices.truncate(max_reads);
1556        indices.sort_unstable();
1557        let new_r1: Vec<_> = indices.iter().map(|&i| r1s[i].clone()).collect();
1558        let new_r2: Vec<_> = indices.iter().map(|&i| r2s[i].clone()).collect();
1559        (new_r1, new_r2)
1560    }
1561}
1562
1563#[cfg(test)]
1564#[expect(
1565    clippy::similar_names,
1566    reason = "test variables use domain-specific names like r1/r2, ad/bd"
1567)]
1568mod tests {
1569    use super::*;
1570    use fgumi_raw_bam::ParsedBamRecord;
1571    use fgumi_sam::builder::RecordBuilder;
1572    use fgumi_sam::clipper::cigar_utils;
1573    use noodles::sam::alignment::RecordBuf;
1574    use noodles::sam::alignment::record::Cigar as CigarTrait;
1575    use noodles::sam::alignment::record::Flags;
1576    use noodles::sam::alignment::record::cigar::op::Kind;
1577
1578    #[test]
1579    fn test_codec_caller_creation() {
1580        let options = CodecConsensusOptions::default();
1581        let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1582
1583        assert_eq!(caller.read_name_prefix, "codec");
1584        assert_eq!(caller.read_group_id, "RG1");
1585    }
1586
1587    #[test]
1588    fn test_codec_options_defaults() {
1589        let options = CodecConsensusOptions::default();
1590        assert_eq!(options.min_reads_per_strand, 1);
1591        assert_eq!(options.min_duplex_length, 1);
1592        assert_eq!(options.min_input_base_quality, 10);
1593        assert_eq!(options.error_rate_pre_umi, 45);
1594        assert_eq!(options.error_rate_post_umi, 40);
1595    }
1596
1597    #[test]
1598    fn test_empty_input() {
1599        let options = CodecConsensusOptions::default();
1600        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1601
1602        let output = caller.consensus_reads_from_sam_records(Vec::new()).unwrap();
1603        assert_eq!(output.count, 0);
1604        assert_eq!(caller.stats.total_input_reads, 0);
1605    }
1606
1607    #[test]
1608    fn test_reverse_complement() {
1609        assert_eq!(reverse_complement(b"ACGT"), b"ACGT".to_vec());
1610        assert_eq!(reverse_complement(b"AAAA"), b"TTTT".to_vec());
1611        assert_eq!(reverse_complement(b"ACGTN"), b"NACGT".to_vec());
1612        assert_eq!(reverse_complement(b""), b"".to_vec());
1613    }
1614
1615    #[test]
1616    fn test_stats_duplex_disagreement_rate() {
1617        let mut stats = CodecConsensusStats::default();
1618        assert!(stats.duplex_disagreement_rate().abs() < f64::EPSILON);
1619
1620        stats.consensus_duplex_bases_emitted = 100;
1621        stats.duplex_disagreement_base_count = 5;
1622        assert!((stats.duplex_disagreement_rate() - 0.05).abs() < 1e-6);
1623    }
1624
1625    /// Helper function to create a test paired read
1626    #[expect(
1627        clippy::too_many_arguments,
1628        clippy::cast_possible_truncation,
1629        clippy::cast_possible_wrap,
1630        clippy::cast_sign_loss,
1631        reason = "test helper needs all parameters and uses casts for BAM position math"
1632    )]
1633    fn create_test_paired_read(
1634        name: &str,
1635        seq: &[u8],
1636        qual: &[u8],
1637        is_first: bool,
1638        is_reverse: bool,
1639        mate_is_reverse: bool,
1640        ref_start: usize,
1641        cigar_ops: &[(Kind, usize)],
1642    ) -> RecordBuf {
1643        // Convert CIGAR ops to string and calculate reference length
1644        use std::fmt::Write;
1645        let mut ref_len = 0usize;
1646        let cigar_str: String = cigar_ops.iter().fold(String::new(), |mut acc, (kind, len)| {
1647            // Calculate reference length consumed
1648            match kind {
1649                Kind::Match
1650                | Kind::SequenceMatch
1651                | Kind::SequenceMismatch
1652                | Kind::Deletion
1653                | Kind::Skip => {
1654                    ref_len += len;
1655                }
1656                _ => {}
1657            }
1658            let c = match kind {
1659                Kind::Match => 'M',
1660                Kind::Insertion => 'I',
1661                Kind::Deletion => 'D',
1662                Kind::SoftClip => 'S',
1663                Kind::HardClip => 'H',
1664                Kind::Skip => 'N',
1665                Kind::Pad => 'P',
1666                Kind::SequenceMatch => '=',
1667                Kind::SequenceMismatch => 'X',
1668            };
1669            let _ = write!(acc, "{len}{c}");
1670            acc
1671        });
1672
1673        // Calculate mate position and template length for FR pair detection
1674        // Use a typical insert size of 200bp
1675        let insert_size: i32 = 200;
1676        let (mate_start, tlen) = if is_reverse {
1677            // Read on reverse strand: mate should be before this read
1678            let mate_pos = (ref_start as i32 - insert_size + ref_len as i32).max(1) as usize;
1679            (mate_pos, -insert_size)
1680        } else {
1681            // Read on positive strand: mate should be after this read
1682            let mate_pos = ((ref_start as i32) + insert_size - (ref_len as i32)).max(1) as usize;
1683            (mate_pos, insert_size)
1684        };
1685
1686        let seq_str = String::from_utf8_lossy(seq);
1687        RecordBuilder::new()
1688            .name(name)
1689            .sequence(&seq_str)
1690            .qualities(qual)
1691            .reference_sequence_id(0)
1692            .alignment_start(ref_start)
1693            .cigar(&cigar_str)
1694            .first_segment(is_first)
1695            .properly_paired(true)
1696            .reverse_complement(is_reverse)
1697            .mate_reverse_complement(mate_is_reverse)
1698            .mate_reference_sequence_id(0)
1699            .mate_alignment_start(mate_start)
1700            .template_length(tlen)
1701            .tag("MI", "UMI123")
1702            .build()
1703    }
1704
1705    #[test]
1706    fn test_is_fr_pair() {
1707        use noodles::sam::alignment::record::cigar::op::Kind;
1708
1709        let options = CodecConsensusOptions::default();
1710        let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1711
1712        // FR pair: R1 forward, R2 reverse
1713        let r1_fr = create_test_paired_read(
1714            "read1",
1715            b"ACGT",
1716            b"####",
1717            true,
1718            false,
1719            true,
1720            100,
1721            &[(Kind::Match, 4)],
1722        );
1723        assert!(caller.is_fr_pair(&r1_fr));
1724
1725        // RF pair: R1 reverse, R2 forward - still considered FR orientation
1726        let r1_rf = create_test_paired_read(
1727            "read1",
1728            b"ACGT",
1729            b"####",
1730            true,
1731            true,
1732            false,
1733            100,
1734            &[(Kind::Match, 4)],
1735        );
1736        assert!(caller.is_fr_pair(&r1_rf));
1737
1738        // FF pair: both forward - not FR
1739        let r1_ff = create_test_paired_read(
1740            "read1",
1741            b"ACGT",
1742            b"####",
1743            true,
1744            false,
1745            false,
1746            100,
1747            &[(Kind::Match, 4)],
1748        );
1749        assert!(!caller.is_fr_pair(&r1_ff));
1750    }
1751
1752    #[test]
1753    fn test_simplify_cigar() {
1754        use noodles::sam::alignment::record::cigar::op::Kind;
1755
1756        // Test simple match
1757        let read = create_test_paired_read(
1758            "read",
1759            b"ACGTACGT",
1760            b"########",
1761            true,
1762            false,
1763            true,
1764            100,
1765            &[(Kind::Match, 8)],
1766        );
1767        assert_eq!(cigar_utils::simplify_cigar(read.cigar()), vec![(Kind::Match, 8)]);
1768
1769        // Test with indels
1770        let read_indel = create_test_paired_read(
1771            "read",
1772            b"ACGTACGT",
1773            b"########",
1774            true,
1775            false,
1776            true,
1777            100,
1778            &[(Kind::Match, 4), (Kind::Deletion, 2), (Kind::Match, 4)],
1779        );
1780        assert_eq!(
1781            cigar_utils::simplify_cigar(read_indel.cigar()),
1782            vec![(Kind::Match, 4), (Kind::Deletion, 2), (Kind::Match, 4)]
1783        );
1784
1785        // Test coalescing: soft clip + match should coalesce to match
1786        let read_softclip = create_test_paired_read(
1787            "read",
1788            b"ACGTACGT",
1789            b"########",
1790            true,
1791            false,
1792            true,
1793            100,
1794            &[(Kind::SoftClip, 2), (Kind::Match, 6)],
1795        );
1796        assert_eq!(
1797            cigar_utils::simplify_cigar(read_softclip.cigar()),
1798            vec![(Kind::Match, 8)] // Should coalesce S+M to M
1799        );
1800    }
1801
1802    #[test]
1803    fn test_filter_to_most_common_alignment() {
1804        use noodles::sam::alignment::record::cigar::op::Kind;
1805
1806        let options = CodecConsensusOptions::default();
1807        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1808
1809        // Create 3 reads with same CIGAR and 1 with different
1810        let reads = vec![
1811            create_test_paired_read(
1812                "r1",
1813                b"ACGT",
1814                b"####",
1815                true,
1816                false,
1817                true,
1818                100,
1819                &[(Kind::Match, 4)],
1820            ),
1821            create_test_paired_read(
1822                "r2",
1823                b"ACGT",
1824                b"####",
1825                true,
1826                false,
1827                true,
1828                100,
1829                &[(Kind::Match, 4)],
1830            ),
1831            create_test_paired_read(
1832                "r3",
1833                b"ACGT",
1834                b"####",
1835                true,
1836                false,
1837                true,
1838                100,
1839                &[(Kind::Match, 4)],
1840            ),
1841            create_test_paired_read(
1842                "r4",
1843                b"ACGT",
1844                b"####",
1845                true,
1846                false,
1847                true,
1848                100,
1849                &[(Kind::Match, 2), (Kind::Deletion, 1), (Kind::Match, 2)],
1850            ),
1851        ];
1852
1853        let filtered = caller.filter_to_most_common_alignment(reads);
1854        assert_eq!(filtered.len(), 3);
1855        assert_eq!(
1856            caller.stats.rejection_reasons.get(&CallerRejectionReason::MinorityAlignment),
1857            Some(&1)
1858        );
1859    }
1860
1861    /// Tests deterministic tie-breaking when two groups have equal sizes.
1862    /// With CIGAR-based tie-breaking, the group with the "smaller" CIGAR wins.
1863    /// CIGAR comparison is done element-by-element: first by length, then by operation kind.
1864    /// This matches fgbio's filterToMostCommonAlignment behavior with deterministic tie-breaking.
1865    #[test]
1866    fn test_filter_to_most_common_alignment_tie_breaking() {
1867        use noodles::sam::alignment::record::cigar::op::Kind;
1868
1869        let options = CodecConsensusOptions::default();
1870        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1871
1872        // Create two groups with equal sizes: 2 reads with 4M and 2 reads with 3M1D1M
1873        // Both have the same query length (4 bases).
1874        // With deterministic CIGAR-based tie-breaking:
1875        // - 4M = [(Match, 4)]
1876        // - 3M1D1M = [(Match, 3), (Deletion, 1), (Match, 1)]
1877        // Comparing first elements: length 3 < length 4, so 3M1D1M is "smaller" and wins.
1878        let reads = vec![
1879            create_test_paired_read(
1880                "r1_4M",
1881                b"ACGT",
1882                b"####",
1883                true,
1884                false,
1885                true,
1886                100,
1887                &[(Kind::Match, 4)],
1888            ),
1889            create_test_paired_read(
1890                "r2_4M",
1891                b"ACGT",
1892                b"####",
1893                true,
1894                false,
1895                true,
1896                100,
1897                &[(Kind::Match, 4)],
1898            ),
1899            create_test_paired_read(
1900                "r3_del",
1901                b"ACGT",
1902                b"####",
1903                true,
1904                false,
1905                true,
1906                100,
1907                &[(Kind::Match, 3), (Kind::Deletion, 1), (Kind::Match, 1)],
1908            ),
1909            create_test_paired_read(
1910                "r4_del",
1911                b"ACGT",
1912                b"####",
1913                true,
1914                false,
1915                true,
1916                100,
1917                &[(Kind::Match, 3), (Kind::Deletion, 1), (Kind::Match, 1)],
1918            ),
1919        ];
1920
1921        let filtered = caller.filter_to_most_common_alignment(reads);
1922        // Both groups have size 2, so we expect 2 reads to be returned
1923        assert_eq!(filtered.len(), 2);
1924        // The 3M1D1M group should be selected due to deterministic CIGAR tie-breaking
1925        // (smaller CIGAR wins, and 3 < 4 in the first element comparison)
1926        // Verify that the filtered reads are the 3M1D1M reads (r3_del and r4_del)
1927        for read in &filtered {
1928            let cigar = read.cigar();
1929            assert_eq!(cigar.as_ref().len(), 3, "Expected 3-op cigar (3M1D1M)");
1930            let ops: Vec<_> = cigar.iter().map(|r| r.unwrap()).collect();
1931            assert_eq!(ops[0].kind(), Kind::Match);
1932            assert_eq!(ops[0].len(), 3);
1933            assert_eq!(ops[1].kind(), Kind::Deletion);
1934            assert_eq!(ops[1].len(), 1);
1935            assert_eq!(ops[2].kind(), Kind::Match);
1936            assert_eq!(ops[2].len(), 1);
1937        }
1938        // 2 reads should be rejected
1939        assert_eq!(
1940            caller.stats.rejection_reasons.get(&CallerRejectionReason::MinorityAlignment),
1941            Some(&2)
1942        );
1943    }
1944
1945    /// Tests that CIGAR is reversed for negative strand reads before minority alignment filtering.
1946    /// This matches fgbio's toSourceReadForCodec behavior where the CIGAR is reversed for
1947    /// negative strand reads before filterToMostCommonAlignment.
1948    ///
1949    /// Without reversal:
1950    /// - Read A (3M1I2M): first element length = 3
1951    /// - Read B (2M1I3M): first element length = 2
1952    /// - 2 < 3, so Read B would win
1953    ///
1954    /// With reversal (both negative strand):
1955    /// - Read A reversed (2M1I3M): first element length = 2
1956    /// - Read B reversed (3M1I2M): first element length = 3
1957    /// - 2 < 3, so Read A wins
1958    #[test]
1959    fn test_filter_to_most_common_alignment_negative_strand_cigar_reversal() {
1960        use noodles::sam::alignment::record::cigar::op::Kind;
1961
1962        let options = CodecConsensusOptions::default();
1963        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
1964
1965        // Create two groups with equal sizes, both on negative strand
1966        // The CIGAR reversal should affect which group wins the tie-breaker
1967        let reads = vec![
1968            // Group A: 3M1I2M on negative strand -> reversed to 2M1I3M
1969            create_test_paired_read(
1970                "r1_groupA",
1971                b"ACGTAC",
1972                b"######",
1973                true,
1974                true, // negative strand
1975                false,
1976                100,
1977                &[(Kind::Match, 3), (Kind::Insertion, 1), (Kind::Match, 2)],
1978            ),
1979            create_test_paired_read(
1980                "r2_groupA",
1981                b"ACGTAC",
1982                b"######",
1983                true,
1984                true, // negative strand
1985                false,
1986                100,
1987                &[(Kind::Match, 3), (Kind::Insertion, 1), (Kind::Match, 2)],
1988            ),
1989            // Group B: 2M1I3M on negative strand -> reversed to 3M1I2M
1990            create_test_paired_read(
1991                "r3_groupB",
1992                b"ACGTAC",
1993                b"######",
1994                true,
1995                true, // negative strand
1996                false,
1997                100,
1998                &[(Kind::Match, 2), (Kind::Insertion, 1), (Kind::Match, 3)],
1999            ),
2000            create_test_paired_read(
2001                "r4_groupB",
2002                b"ACGTAC",
2003                b"######",
2004                true,
2005                true, // negative strand
2006                false,
2007                100,
2008                &[(Kind::Match, 2), (Kind::Insertion, 1), (Kind::Match, 3)],
2009            ),
2010        ];
2011
2012        let filtered = caller.filter_to_most_common_alignment(reads);
2013
2014        // Both groups have size 2, so CIGAR-based tie-breaking kicks in
2015        assert_eq!(filtered.len(), 2);
2016
2017        // With CIGAR reversal for negative strand:
2018        // - Group A's 3M1I2M -> reversed to 2M1I3M (first element = 2)
2019        // - Group B's 2M1I3M -> reversed to 3M1I2M (first element = 3)
2020        // Since 2 < 3, Group A wins (the 3M1I2M reads)
2021        // Without reversal, Group B would win
2022        for read in &filtered {
2023            let cigar = read.cigar();
2024            let ops: Vec<_> = cigar.iter().map(|r| r.unwrap()).collect();
2025            assert_eq!(ops.len(), 3, "Expected 3-op cigar");
2026            // The original CIGAR should be 3M1I2M (Group A)
2027            assert_eq!(ops[0].kind(), Kind::Match);
2028            assert_eq!(ops[0].len(), 3, "Expected first element to be 3M (Group A won)");
2029            assert_eq!(ops[1].kind(), Kind::Insertion);
2030            assert_eq!(ops[1].len(), 1);
2031            assert_eq!(ops[2].kind(), Kind::Match);
2032            assert_eq!(ops[2].len(), 2);
2033        }
2034
2035        // 2 reads from Group B should be rejected
2036        assert_eq!(
2037            caller.stats.rejection_reasons.get(&CallerRejectionReason::MinorityAlignment),
2038            Some(&2)
2039        );
2040    }
2041
2042    // ==========================================================================
2043    // Integration tests ported from fgbio CodecConsensusCallerTest
2044    // ==========================================================================
2045
2046    /// Reference sequence for tests (from fgbio)
2047    const REF_BASES: &[u8] = b"TGGGTGTTGTTTGGGTTCCTGCTTAAAGAGCTACTGTTCTTCACAGAAACTTCCAACTCACCCAGACTGAGATTTGTACTGAGACTACGATCCACATGTTCAATATCTGATATCTGATGGGAAATAGGCTTTACTGAATTATCCATTTGGGCTGTAATTAATTTCAGTGATGAGCGGGAGATGTTGTTAGTTGTGCTCAGTAACTTTTTGATAGTAGCGGGAGTAGGAGTAAATCTTGTACTAATTAGTGAATATTCTGTTGATGGTGGCTGAAAATTTATAGCTACACAACCAAAAAAATAAAAAACGTTAGTCAATAGCATTTATAAATAGTCTTCTCTACCTGAAATATTTTACATTAAGTAATTCATTCCTTCATTTAGTATCTACACATGTCTAACATTGTAGTAGGAGCTGTGTACTAACAAGAAATCATGACACTGTTTCTGCCTTCAAGGAGCTTATAATCTTTTGGGGTACACAAGATAACCCAGAATGTTAAATAGTATAAAAGTCAAAGTACAATAATTTATTTCATTAAGATTTTGAAATGGCTAACAAACACCTGTTGATCACCTCATACACATGAGCCTCAAAACAAAGGAAAGCACAGCCCCTATGCCTGAGCAATTTAGAATATTGTCAAGGATAGAGACATGTGAGCCATTCACTATGAAACAATCATTGAGAACTACTACAAGAGTGATAAATATAAAATGAAACCTACAGAAACACAGAAGAGTAAGTAATTTTCCCTATAAAGAAGACAGGAACTAAATGTATAAGCAAAAATTGGGAAATTATATAAATGCTATTTTATATGAGAGGCAAAGAACCACAGGTCTAATAATTTTACAAATGTGATAAAATCAGATTTTATGTCCCCATCTTTCTTGACTGCTCAGCTAGAAATTAAAACATTTTTACACATCTTTTTGGCGGGGGCGGGGGGGATCATTATTTATTTCACCTGCCAAAATACTTCATTTCCTTATTGCACTTTTTTACTTCTTTGGTATGGAAAAATCTAACGGGTTTTAGAGTATGAACACATTTTAAGCAGTGATTAGATACGTTTTTCTTGTTATGCTTTCTATTGCAAATTTAGGATTTGATTTTGCACTGTCTTCATGCAAAGCTCTTCTCAAAGGTCTTAAAATATAAAAAACACTTAATGCTTCTCAAAGCATTAAGATTTTATGTAAATCAAACCAAAACCAGAAAAAGACAGAAGAAAATGAACCAAAAACAACAAAAATAATCCTTAACATAGTTGGCAACAAGTGCAATGAAAGATTTTT";
2048
2049    /// Helper to create an FR read pair similar to fgbio's SamBuilder.addPair
2050    ///
2051    /// Creates a properly oriented FR pair where:
2052    /// - R1 is `first_segment`, forward (unless strand1=Minus)
2053    /// - R2 is `last_segment`, reverse (unless strand2=Plus)
2054    /// - Both reads have proper mate information set
2055    #[expect(
2056        clippy::too_many_arguments,
2057        clippy::too_many_lines,
2058        clippy::cast_possible_truncation,
2059        clippy::cast_possible_wrap,
2060        reason = "test helper needs all parameters, many lines, and uses casts for BAM position math"
2061    )]
2062    fn create_fr_pair(
2063        name: &str,
2064        start1: usize,
2065        start2: usize,
2066        _read_length: usize,
2067        base_quality: u8,
2068        cigar1: &[(Kind, usize)],
2069        cigar2: &[(Kind, usize)],
2070        mi_tag: &str,
2071        rx_tag: Option<&str>,
2072        strand1_reverse: bool,
2073        strand2_reverse: bool,
2074    ) -> Vec<RecordBuf> {
2075        // Calculate reference end positions based on CIGAR
2076        let calc_ref_length = |cigar: &[(Kind, usize)]| -> usize {
2077            cigar
2078                .iter()
2079                .filter(|(k, _)| matches!(k, Kind::Match | Kind::Deletion | Kind::Skip))
2080                .map(|(_, len)| *len)
2081                .sum()
2082        };
2083
2084        let ref_len1 = calc_ref_length(cigar1);
2085        let ref_len2 = calc_ref_length(cigar2);
2086
2087        // Get sequence for R1 and R2 based on positions
2088        let get_sequence = |start: usize, cigar: &[(Kind, usize)]| -> Vec<u8> {
2089            let mut seq = Vec::new();
2090            let mut ref_pos = start - 1; // Convert to 0-based
2091            for &(kind, len) in cigar {
2092                match kind {
2093                    Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
2094                        // Consume both ref and read
2095                        let end = (ref_pos + len).min(REF_BASES.len());
2096                        if ref_pos < REF_BASES.len() {
2097                            seq.extend_from_slice(&REF_BASES[ref_pos..end]);
2098                            // Pad with A's if we ran past ref
2099                            let pad_len = (ref_pos + len).saturating_sub(end);
2100                            seq.resize(seq.len() + pad_len, b'A');
2101                        }
2102                        ref_pos += len;
2103                    }
2104                    Kind::Insertion | Kind::SoftClip => {
2105                        // Consume read only - add placeholder bases
2106                        seq.resize(seq.len() + len, b'A');
2107                    }
2108                    Kind::Deletion | Kind::Skip => {
2109                        // Consume ref only
2110                        ref_pos += len;
2111                    }
2112                    Kind::HardClip | Kind::Pad => {
2113                        // No change
2114                    }
2115                }
2116            }
2117            seq
2118        };
2119
2120        let seq1_fwd = get_sequence(start1, cigar1);
2121        let seq2_fwd = get_sequence(start2, cigar2);
2122
2123        // For a true FR pair sequencing the same DNA molecule from opposite strands:
2124        // - Positive strand reads are stored as-is in BAM (matches reference)
2125        // - Negative strand reads are stored as revcomp in BAM
2126        //   After revcomp during SS building, they become the same as reference
2127        // This ensures that both R1 and R2 SS consensuses agree in the overlap.
2128        let seq1: Vec<u8> = if strand1_reverse { reverse_complement(&seq1_fwd) } else { seq1_fwd };
2129        let seq2: Vec<u8> = if strand2_reverse { reverse_complement(&seq2_fwd) } else { seq2_fwd };
2130
2131        let qual1 = vec![base_quality; seq1.len()];
2132        let qual2 = vec![base_quality; seq2.len()];
2133
2134        // Convert CIGAR ops to strings
2135        let cigar_to_str = |cigar: &[(Kind, usize)]| -> String {
2136            use std::fmt::Write;
2137            cigar.iter().fold(String::new(), |mut acc, (kind, len)| {
2138                let c = match kind {
2139                    Kind::Match => 'M',
2140                    Kind::Insertion => 'I',
2141                    Kind::Deletion => 'D',
2142                    Kind::SoftClip => 'S',
2143                    Kind::HardClip => 'H',
2144                    Kind::Skip => 'N',
2145                    Kind::Pad => 'P',
2146                    Kind::SequenceMatch => '=',
2147                    Kind::SequenceMismatch => 'X',
2148                };
2149                let _ = write!(acc, "{len}{c}");
2150                acc
2151            })
2152        };
2153
2154        let cigar1_str = cigar_to_str(cigar1);
2155        let cigar2_str = cigar_to_str(cigar2);
2156
2157        // Calculate template length (insert size)
2158        // For FR pairs, template length should be positive for the leftmost read
2159        let template_len = if start1 <= start2 {
2160            (start2 + ref_len2 - start1) as i32
2161        } else {
2162            -((start1 + ref_len1 - start2) as i32)
2163        };
2164
2165        // Build R1
2166        let seq1_str = String::from_utf8_lossy(&seq1);
2167        let mut r1_builder = RecordBuilder::new()
2168            .name(name)
2169            .sequence(&seq1_str)
2170            .qualities(&qual1)
2171            .reference_sequence_id(0)
2172            .alignment_start(start1)
2173            .cigar(&cigar1_str)
2174            .first_segment(true)
2175            .properly_paired(true)
2176            .reverse_complement(strand1_reverse)
2177            .mate_reverse_complement(strand2_reverse)
2178            .mate_reference_sequence_id(0)
2179            .mate_alignment_start(start2)
2180            .template_length(template_len)
2181            .tag("MI", mi_tag);
2182
2183        if let Some(rx) = rx_tag {
2184            r1_builder = r1_builder.tag("RX", rx);
2185        }
2186        let r1 = r1_builder.build();
2187
2188        // Build R2
2189        let seq2_str = String::from_utf8_lossy(&seq2);
2190        let mut r2_builder = RecordBuilder::new()
2191            .name(name)
2192            .sequence(&seq2_str)
2193            .qualities(&qual2)
2194            .reference_sequence_id(0)
2195            .alignment_start(start2)
2196            .cigar(&cigar2_str)
2197            .first_segment(false)
2198            .properly_paired(true)
2199            .reverse_complement(strand2_reverse)
2200            .mate_reverse_complement(strand1_reverse)
2201            .mate_reference_sequence_id(0)
2202            .mate_alignment_start(start1)
2203            .template_length(-template_len)
2204            .tag("MI", mi_tag);
2205
2206        if let Some(rx) = rx_tag {
2207            r2_builder = r2_builder.tag("RX", rx);
2208        }
2209        let r2 = r2_builder.build();
2210
2211        vec![r1, r2]
2212    }
2213
2214    /// Port of fgbio test: "make a consensus from two simple reads"
2215    ///
2216    /// Tests that a simple FR pair produces a consensus with correct structure.
2217    /// Note: Positions covered by only one strand get N bases (correct duplex behavior),
2218    /// so we don't check exact sequence content - only that consensus structure is correct.
2219    #[test]
2220    fn test_make_consensus_from_simple_reads() {
2221        let options = CodecConsensusOptions {
2222            min_reads_per_strand: 1,
2223            min_duplex_length: 1,
2224            ..Default::default()
2225        };
2226        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2227
2228        // Create FR pair: R1 at pos 1, R2 at pos 11, both 30bp
2229        let reads = create_fr_pair(
2230            "read1",
2231            1,
2232            11,
2233            30,
2234            35,
2235            &[(Kind::Match, 30)],
2236            &[(Kind::Match, 30)],
2237            "hi",
2238            Some("ACC-TGA"),
2239            false, // R1 forward
2240            true,  // R2 reverse
2241        );
2242
2243        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2244
2245        assert_eq!(output.count, 1, "Should produce one consensus read");
2246
2247        let records = ParsedBamRecord::parse_all(&output.data);
2248        let consensus = &records[0];
2249        // Check read name format
2250        let name = String::from_utf8_lossy(&consensus.name);
2251        assert!(name.contains("hi"), "Consensus name should contain MI tag: {name}");
2252
2253        // Consensus should cover from pos 1 to pos 40 (R1: 1-30, R2: 11-40)
2254        assert_eq!(consensus.bases.len(), 40, "Consensus should be 40bp");
2255
2256        // Check RX tag is preserved (UmiBases consensus)
2257        let rx = consensus.get_string_tag(b"RX").expect("RX tag not found in consensus");
2258        assert_eq!(&rx[..], b"ACC-TGA", "RX tag should be preserved");
2259    }
2260
2261    /// Port of fgbio test: "make a consensus where R1 has a deletion outside of the overlap region"
2262    #[test]
2263    fn test_make_consensus_r1_deletion() {
2264        let options = CodecConsensusOptions {
2265            min_reads_per_strand: 1,
2266            min_duplex_length: 1,
2267            ..Default::default()
2268        };
2269        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2270
2271        // R1 at pos 1 with 5M2D25M, R2 at pos 13 with 30M
2272        let reads = create_fr_pair(
2273            "read1",
2274            1,
2275            13,
2276            30,
2277            35,
2278            &[(Kind::Match, 5), (Kind::Deletion, 2), (Kind::Match, 25)],
2279            &[(Kind::Match, 30)],
2280            "hi",
2281            Some("ACC-TGA"),
2282            false,
2283            true,
2284        );
2285
2286        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2287
2288        // Should produce consensus
2289        assert_eq!(output.count, 1, "Should produce one consensus read");
2290
2291        // Consensus should cover the region with deletion accounted for
2292        let records = ParsedBamRecord::parse_all(&output.data);
2293        let consensus = &records[0];
2294        assert!(!consensus.bases.is_empty(), "Should have sequence");
2295    }
2296
2297    /// Port of fgbio test: "not emit a consensus when the reads are an RF pair"
2298    #[test]
2299    fn test_not_emit_consensus_for_rf_pair() {
2300        let options = CodecConsensusOptions {
2301            min_reads_per_strand: 1,
2302            min_duplex_length: 1,
2303            ..Default::default()
2304        };
2305        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2306
2307        // RF pair: R1 reverse at pos 100, R2 forward at pos 135
2308        let reads = create_fr_pair(
2309            "read1",
2310            100,
2311            135,
2312            30,
2313            35,
2314            &[(Kind::Match, 30)],
2315            &[(Kind::Match, 30)],
2316            "hi",
2317            Some("ACC-TGA"),
2318            true,  // R1 reverse
2319            false, // R2 forward
2320        );
2321
2322        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2323
2324        assert_eq!(output.count, 0, "Should not emit consensus for RF pair");
2325    }
2326
2327    /// Port of fgbio test: "not emit a consensus when there are insufficient reads"
2328    #[test]
2329    fn test_not_emit_consensus_insufficient_reads() {
2330        // With minReadsPerStrand=2, a single pair should fail
2331        let options2 = CodecConsensusOptions {
2332            min_reads_per_strand: 2,
2333            min_duplex_length: 1,
2334            ..Default::default()
2335        };
2336        let mut caller2 =
2337            CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options2);
2338
2339        let reads = create_fr_pair(
2340            "read1",
2341            1,
2342            11,
2343            30,
2344            35,
2345            &[(Kind::Match, 30)],
2346            &[(Kind::Match, 30)],
2347            "hi",
2348            Some("ACC-TGA"),
2349            false,
2350            true,
2351        );
2352
2353        let output = caller2.consensus_reads_from_sam_records(reads).unwrap();
2354
2355        assert_eq!(output.count, 0, "Should not emit consensus with insufficient reads");
2356
2357        // With minReadsPerStrand=1, it should succeed
2358        let options1 = CodecConsensusOptions {
2359            min_reads_per_strand: 1,
2360            min_duplex_length: 1,
2361            ..Default::default()
2362        };
2363        let mut caller1 =
2364            CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options1);
2365
2366        let reads = create_fr_pair(
2367            "read1",
2368            1,
2369            11,
2370            30,
2371            35,
2372            &[(Kind::Match, 30)],
2373            &[(Kind::Match, 30)],
2374            "hi",
2375            Some("ACC-TGA"),
2376            false,
2377            true,
2378        );
2379
2380        let output = caller1.consensus_reads_from_sam_records(reads).unwrap();
2381
2382        assert_eq!(output.count, 1, "Should emit consensus with minReadsPerStrand=1");
2383    }
2384
2385    /// Port of fgbio test: "not emit a consensus when there is insufficient overlap between R1 and R2"
2386    #[test]
2387    fn test_not_emit_consensus_insufficient_overlap() {
2388        // R1 at 1-30, R2 at 11-40 gives 20bp overlap
2389        // With minDuplexLength=20, should pass
2390        let options20 = CodecConsensusOptions {
2391            min_reads_per_strand: 1,
2392            min_duplex_length: 20,
2393            ..Default::default()
2394        };
2395        let mut caller20 =
2396            CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options20);
2397
2398        let reads = create_fr_pair(
2399            "read1",
2400            1,
2401            11,
2402            30,
2403            35,
2404            &[(Kind::Match, 30)],
2405            &[(Kind::Match, 30)],
2406            "hi",
2407            Some("ACC-TGA"),
2408            false,
2409            true,
2410        );
2411
2412        let output = caller20.consensus_reads_from_sam_records(reads).unwrap();
2413        assert_eq!(
2414            output.count, 1,
2415            "Should emit consensus with minDuplexLength=20 and 20bp overlap"
2416        );
2417
2418        // With minDuplexLength=21, should fail
2419        let options21 = CodecConsensusOptions {
2420            min_reads_per_strand: 1,
2421            min_duplex_length: 21,
2422            ..Default::default()
2423        };
2424        let mut caller21 =
2425            CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options21);
2426
2427        let reads = create_fr_pair(
2428            "read1",
2429            1,
2430            11,
2431            30,
2432            35,
2433            &[(Kind::Match, 30)],
2434            &[(Kind::Match, 30)],
2435            "hi",
2436            Some("ACC-TGA"),
2437            false,
2438            true,
2439        );
2440
2441        let output = caller21.consensus_reads_from_sam_records(reads).unwrap();
2442        assert_eq!(
2443            output.count, 0,
2444            "Should not emit consensus with minDuplexLength=21 and 20bp overlap"
2445        );
2446    }
2447
2448    /// Port of fgbio test: "not emit a consensus when the read pair has one mate unmapped"
2449    #[test]
2450    fn test_not_emit_consensus_unmapped_mate() {
2451        let options = CodecConsensusOptions {
2452            min_reads_per_strand: 1,
2453            min_duplex_length: 1,
2454            ..Default::default()
2455        };
2456        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2457
2458        // Create FR pair then mark R2 as unmapped
2459        let mut reads = create_fr_pair(
2460            "read1",
2461            1,
2462            11,
2463            30,
2464            35,
2465            &[(Kind::Match, 30)],
2466            &[(Kind::Match, 30)],
2467            "hi",
2468            Some("ACC-TGA"),
2469            false,
2470            true,
2471        );
2472
2473        // Mark R2 as unmapped
2474        let r2 = &mut reads[1];
2475        *r2.flags_mut() = r2.flags() | Flags::UNMAPPED;
2476
2477        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2478
2479        assert_eq!(output.count, 0, "Should not emit consensus when mate is unmapped");
2480    }
2481
2482    /// Port of fgbio test: "emit the consensus in the orientation of R1"
2483    ///
2484    /// This test verifies that the consensus output orientation follows R1.
2485    ///
2486    /// Tests that a forward R1 produces a consensus with the expected length.
2487    /// Note: Positions covered by only one strand get N bases (correct duplex behavior).
2488    #[test]
2489    fn test_emit_consensus_in_r1_orientation() {
2490        // Test that forward R1 produces consensus with correct length
2491        let options = CodecConsensusOptions {
2492            min_reads_per_strand: 1,
2493            min_duplex_length: 1,
2494            ..Default::default()
2495        };
2496        let mut caller_fwd =
2497            CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options.clone());
2498
2499        // Forward R1 (start=1), reverse R2 (start=11)
2500        let reads_fwd = create_fr_pair(
2501            "read1",
2502            1,
2503            11,
2504            30,
2505            35,
2506            &[(Kind::Match, 30)],
2507            &[(Kind::Match, 30)],
2508            "hi",
2509            Some("ACC-TGA"),
2510            false, // R1 forward
2511            true,  // R2 reverse
2512        );
2513
2514        let output_fwd = caller_fwd.consensus_reads_from_sam_records(reads_fwd).unwrap();
2515        assert_eq!(output_fwd.count, 1, "Should produce one consensus read");
2516
2517        let records_fwd = ParsedBamRecord::parse_all(&output_fwd.data);
2518        // Consensus should cover from pos 1 to pos 40 (R1: 1-30, R2: 11-40)
2519        assert_eq!(records_fwd[0].bases.len(), 40, "Forward R1 should produce 40bp consensus");
2520
2521        // Verify orientation flag is set correctly (forward R1 = unmapped flag only, not reverse)
2522        assert_eq!(
2523            records_fwd[0].flag & flags::REVERSE,
2524            0,
2525            "Forward R1 should produce forward-oriented consensus"
2526        );
2527    }
2528
2529    /// Port of fgbio test: "not emit a consensus when there are a lot of disagreements between strands"
2530    ///
2531    /// Tests disagreement filtering: with permissive settings consensus is produced,
2532    /// but with strict settings, consensus fails due to strand disagreements at
2533    /// non-overlapping positions (where one strand has N).
2534    #[test]
2535    fn test_not_emit_consensus_high_disagreement() {
2536        // First test that we get consensus with permissive disagreement settings
2537        let options_permissive = CodecConsensusOptions {
2538            min_reads_per_strand: 1,
2539            min_duplex_length: 1,
2540            max_duplex_disagreements: 100,
2541            max_duplex_disagreement_rate: 1.0,
2542            ..Default::default()
2543        };
2544        let mut caller_permissive =
2545            CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options_permissive);
2546
2547        // Use positions within REF_BASES bounds (positions 1-40)
2548        let reads = create_fr_pair(
2549            "read1",
2550            1,
2551            11,
2552            30,
2553            35,
2554            &[(Kind::Match, 30)],
2555            &[(Kind::Match, 30)],
2556            "hi",
2557            Some("ACC-TGA"),
2558            false,
2559            true,
2560        );
2561
2562        let output = caller_permissive.consensus_reads_from_sam_records(reads).unwrap();
2563        assert_eq!(output.count, 1, "Should emit consensus with permissive settings");
2564
2565        // Now test with strict disagreement limits - should fail because
2566        // positions covered by only one strand (10 on each side) count as disagreements
2567        let options_strict = CodecConsensusOptions {
2568            min_reads_per_strand: 1,
2569            min_duplex_length: 1,
2570            max_duplex_disagreements: 5, // Only allow 5 disagreements
2571            max_duplex_disagreement_rate: 0.05, // 5% rate
2572            ..Default::default()
2573        };
2574        let mut caller_strict =
2575            CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options_strict);
2576
2577        // Use positions within REF_BASES bounds
2578        let reads2 = create_fr_pair(
2579            "read1",
2580            1,
2581            11,
2582            30,
2583            35,
2584            &[(Kind::Match, 30)],
2585            &[(Kind::Match, 30)],
2586            "hi",
2587            Some("ACC-TGA"),
2588            false,
2589            true,
2590        );
2591
2592        // With strict settings, this should fail due to high disagreement
2593        // (positions covered by only one strand count as disagreements)
2594        let cons2 = caller_strict.consensus_reads_from_sam_records(reads2);
2595        assert!(cons2.is_err(), "Should reject consensus with strict disagreement settings");
2596    }
2597
2598    /// Port of fgbio test: "make a consensus where R2 has a deletion outside of the overlap region"
2599    ///
2600    /// fgumi now matches fgbio behavior: produces a 40bp consensus (skipping the deleted region).
2601    #[test]
2602    fn test_make_consensus_r2_deletion() {
2603        let options = CodecConsensusOptions {
2604            min_reads_per_strand: 1,
2605            min_duplex_length: 1,
2606            ..Default::default()
2607        };
2608        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2609
2610        // R1 at pos 1 with 30M, R2 at pos 11 with 25M5D5M
2611        // In fgbio: start1=1, start2=11, cigar1="30M", cigar2="25M5D5M"
2612        // R2 has a 5bp deletion outside the overlap region
2613        let reads = create_fr_pair(
2614            "read1",
2615            1,
2616            11,
2617            30,
2618            35,
2619            &[(Kind::Match, 30)],
2620            &[(Kind::Match, 25), (Kind::Deletion, 5), (Kind::Match, 5)],
2621            "hi",
2622            Some("ACC-TGA"),
2623            false,
2624            true,
2625        );
2626
2627        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2628
2629        // Should produce consensus
2630        assert_eq!(output.count, 1, "Should produce one consensus read");
2631
2632        // fgbio produces 40bp consensus (skipping the deleted region in R2)
2633        // fgumi now matches this behavior by filtering out positions with depth=0
2634        let records = ParsedBamRecord::parse_all(&output.data);
2635        let consensus = &records[0];
2636        let len = consensus.bases.len();
2637        assert_eq!(len, 40, "Consensus should be 40bp (skipping the 5bp deletion), got {len}");
2638    }
2639
2640    /// Port of fgbio test: "make a consensus where the reads have soft-clipping outside of the overlap region"
2641    #[test]
2642    fn test_make_consensus_soft_clipping() {
2643        let options = CodecConsensusOptions {
2644            min_reads_per_strand: 1,
2645            min_duplex_length: 1,
2646            ..Default::default()
2647        };
2648        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2649
2650        // R1 at pos 1 with 5S25M (5bp soft-clip at start)
2651        // R2 at pos 11 with 25M5S (5bp soft-clip at end)
2652        // This creates a situation where:
2653        // - R1 aligned region: 1-25 (25bp aligned, 5bp soft-clipped at start)
2654        // - R2 aligned region: 11-35 (25bp aligned, 5bp soft-clipped at end)
2655        // - Overlap: 11-25 (15bp)
2656        // - Output should include soft-clipped bases from both ends
2657        let reads = create_fr_pair(
2658            "read1",
2659            1,
2660            11,
2661            30,
2662            35,
2663            &[(Kind::SoftClip, 5), (Kind::Match, 25)],
2664            &[(Kind::Match, 25), (Kind::SoftClip, 5)],
2665            "hi",
2666            Some("ACC-TGA"),
2667            false,
2668            true,
2669        );
2670
2671        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2672
2673        // Should produce consensus
2674        assert_eq!(output.count, 1, "Should produce one consensus read");
2675
2676        // fgbio expects length 45: 5 (R1 soft-clip) + 35 (REF[0..35]) + 5 (R2 soft-clip)
2677        let records = ParsedBamRecord::parse_all(&output.data);
2678        let consensus = &records[0];
2679        let len = consensus.bases.len();
2680        // Currently fgumi doesn't include soft-clips, so length is just the reference span
2681        // TODO: Update to expect 45 when soft-clip handling is implemented
2682        assert_eq!(
2683            len, 45,
2684            "Consensus should be 45bp (5 soft-clip + 35 ref + 5 soft-clip), got {len}"
2685        );
2686    }
2687
2688    /// Port of fgbio test: "make a consensus where both reads are soft-clipped at the same end and fully overlapped"
2689    #[test]
2690    fn test_make_consensus_both_soft_clipped_same_end() {
2691        let options = CodecConsensusOptions {
2692            min_reads_per_strand: 1,
2693            min_duplex_length: 1,
2694            ..Default::default()
2695        };
2696        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2697
2698        // Both R1 and R2 at pos 1 with 5S25M (both soft-clipped at start)
2699        // They are fully overlapped
2700        // In fgbio, R2's bases are set to match R1's soft-clipped sequence
2701        let reads = create_fr_pair(
2702            "read1",
2703            1,
2704            1,
2705            30,
2706            35,
2707            &[(Kind::SoftClip, 5), (Kind::Match, 25)],
2708            &[(Kind::SoftClip, 5), (Kind::Match, 25)],
2709            "hi",
2710            Some("ACC-TGA"),
2711            false,
2712            true,
2713        );
2714
2715        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2716
2717        // Should produce consensus
2718        assert_eq!(output.count, 1, "Should produce one consensus read");
2719
2720        // fgbio expects length 30: 5 (soft-clip) + 25 (aligned)
2721        let records = ParsedBamRecord::parse_all(&output.data);
2722        let consensus = &records[0];
2723        assert!(!consensus.bases.is_empty(), "Should have sequence with both reads soft-clipped");
2724    }
2725
2726    /// Port of fgbio test: "not emit a consensus when the reads are a cross-chromosomal chimeric pair"
2727    #[test]
2728    fn test_not_emit_consensus_chimeric_pair() {
2729        let options = CodecConsensusOptions {
2730            min_reads_per_strand: 1,
2731            min_duplex_length: 1,
2732            ..Default::default()
2733        };
2734        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2735
2736        // Create FR pair then make it chimeric by putting R1 on a different chromosome
2737        let mut reads = create_fr_pair(
2738            "read1",
2739            100,
2740            135,
2741            30,
2742            35,
2743            &[(Kind::Match, 30)],
2744            &[(Kind::Match, 30)],
2745            "hi",
2746            Some("ACC-TGA"),
2747            false,
2748            true,
2749        );
2750
2751        // Modify R1 to be on a different reference (chromosome)
2752        // In noodles, reference_sequence_id is Option<usize>
2753        *reads[0].reference_sequence_id_mut() = Some(2); // Different chromosome
2754        // Update mate's reference for consistency
2755        *reads[1].mate_reference_sequence_id_mut() = Some(2);
2756
2757        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2758
2759        // Should not emit consensus for chimeric pairs
2760        assert_eq!(
2761            output.count, 0,
2762            "Should not emit consensus for cross-chromosomal chimeric pair"
2763        );
2764    }
2765
2766    /// Port of fgbio test: "not emit a consensus when R1's end lands in an indel in R2"
2767    ///
2768    /// fgumi now matches fgbio behavior: rejects pairs where overlap boundary lands in an indel.
2769    #[test]
2770    fn test_not_emit_consensus_r1_end_in_indel() {
2771        let options = CodecConsensusOptions {
2772            min_reads_per_strand: 1,
2773            min_duplex_length: 1,
2774            ..Default::default()
2775        };
2776        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2777
2778        // R1 at pos 1 with 30M
2779        // R2 at pos 11 with 19M2D11M
2780        // R1 ends at position 30, which lands inside R2's deletion at position 30-31
2781        let reads = create_fr_pair(
2782            "read1",
2783            1,
2784            11,
2785            30,
2786            35,
2787            &[(Kind::Match, 30)],
2788            &[(Kind::Match, 19), (Kind::Deletion, 2), (Kind::Match, 11)],
2789            "hi",
2790            Some("ACC-TGA"),
2791            false,
2792            true,
2793        );
2794
2795        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2796
2797        // fgbio rejects this because R1's end lands in an indel in R2
2798        // fgumi now implements this check and should also reject
2799        assert_eq!(
2800            output.count, 0,
2801            "Should not emit consensus when R1's end lands in an indel in R2"
2802        );
2803    }
2804
2805    /// Port of fgbio test: "mask end qualities"
2806    #[test]
2807    fn test_mask_end_qualities() {
2808        // This tests the outer_bases_qual and outer_bases_length options
2809        // Create a consensus with outerBasesLength=7 and outerBasesQual=5
2810        let options = CodecConsensusOptions {
2811            min_reads_per_strand: 1,
2812            min_duplex_length: 1,
2813            outer_bases_length: 7,
2814            outer_bases_qual: Some(5),
2815            ..Default::default()
2816        };
2817        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2818
2819        // Create a simple FR pair with high quality bases
2820        let reads = create_fr_pair(
2821            "read1",
2822            1,
2823            1,
2824            50,
2825            90, // High base quality
2826            &[(Kind::Match, 50)],
2827            &[(Kind::Match, 50)],
2828            "hi",
2829            Some("ACC-TGA"),
2830            false,
2831            true,
2832        );
2833
2834        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2835
2836        assert_eq!(output.count, 1, "Should produce one consensus read");
2837
2838        let records = ParsedBamRecord::parse_all(&output.data);
2839        let consensus = &records[0];
2840        let quals = &consensus.quals;
2841
2842        // Check that outer bases have reduced quality
2843        // First 7 bases should have qual <= 5
2844        for (i, &q) in quals.iter().take(7).enumerate() {
2845            assert!(q <= 5, "First 7 bases should have qual <= 5, but base {i} has qual {q}");
2846        }
2847
2848        // Last 7 bases should also have qual <= 5
2849        let len = quals.len();
2850        for (i, &q) in quals.iter().skip(len.saturating_sub(7)).enumerate() {
2851            assert!(q <= 5, "Last 7 bases should have qual <= 5, but base {i} has qual {q}");
2852        }
2853    }
2854
2855    /// Port of fgbio test: "mask single stranded regions _and_ bases"
2856    #[test]
2857    fn test_mask_single_stranded_regions() {
2858        // This tests the single_strand_qual option
2859        // In fgbio, single-stranded bases (where abConsensus has N/qual=2) get masked
2860        let options = CodecConsensusOptions {
2861            min_reads_per_strand: 1,
2862            min_duplex_length: 1,
2863            single_strand_qual: Some(4),
2864            ..Default::default()
2865        };
2866        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2867
2868        // Create a pair where R1 and R2 don't fully overlap
2869        // R1: pos 1-30, R2: pos 20-49
2870        // Overlap region: 20-30 (duplex)
2871        // Single-strand regions: 1-19 (R1 only), 31-49 (R2 only)
2872        let reads = create_fr_pair(
2873            "read1",
2874            1,
2875            20,
2876            30,
2877            90, // High base quality
2878            &[(Kind::Match, 30)],
2879            &[(Kind::Match, 30)],
2880            "hi",
2881            Some("ACC-TGA"),
2882            false,
2883            true,
2884        );
2885
2886        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
2887
2888        assert_eq!(output.count, 1, "Should produce one consensus read");
2889
2890        // The consensus should have single-strand regions masked
2891        // The exact quality values depend on the implementation details
2892        // This test verifies the option is respected
2893        let records = ParsedBamRecord::parse_all(&output.data);
2894        let consensus = &records[0];
2895        let quals = &consensus.quals;
2896
2897        // Single-strand regions should have lower quality
2898        // In fgbio, these get masked to single_strand_qual (4)
2899        // Check that we have some variation in qualities
2900        let has_low_qual = quals.iter().any(|&q| q <= 4);
2901        assert!(
2902            has_low_qual || quals.is_empty(),
2903            "Should have some low-quality single-strand regions (or be filtered)"
2904        );
2905    }
2906
2907    /// Test that uppercase 'N' (`NoCall`) from one strand masks the result to N,
2908    /// but lowercase 'n' (padding) does not mask the result.
2909    /// This matches fgbio's behavior where only uppercase 'N' is considered `NoCall`.
2910    #[test]
2911    fn test_nocall_masking_uppercase_n() {
2912        let options = CodecConsensusOptions::default();
2913        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
2914
2915        // Create two single-strand consensuses:
2916        // Position 0: Both have valid bases (A, A) - should agree
2917        // Position 1: Strand A has 'N' (uppercase NoCall), strand B has 'T' - should mask to N
2918        // Position 2: Strand A has 'A', strand B has 'N' (uppercase NoCall) - should mask to N
2919        // Position 3: Strand A has 'n' (lowercase padding), strand B has 'G' - should NOT mask, use G
2920        // Position 4: Strand A has 'C', strand B has 'n' (lowercase padding) - should NOT mask, use C
2921        let ss_a = SingleStrandConsensus {
2922            bases: b"ANAnC".to_vec(),
2923            quals: vec![40, 2, 40, 0, 40], // N has qual 2 (NoCallQual), n has qual 0 (padding)
2924            depths: vec![2, 0, 2, 0, 2],
2925            errors: vec![0, 0, 0, 0, 0],
2926            raw_read_count: 2,
2927            ref_start: 0,
2928            ref_end: 4,
2929            is_negative_strand: false,
2930        };
2931
2932        let ss_b = SingleStrandConsensus {
2933            bases: b"ATNGn".to_vec(),
2934            quals: vec![40, 40, 2, 40, 0], // N has qual 2 (NoCallQual), n has qual 0 (padding)
2935            depths: vec![2, 2, 0, 2, 0],
2936            errors: vec![0, 0, 0, 0, 0],
2937            raw_read_count: 2,
2938            ref_start: 0,
2939            ref_end: 4,
2940            is_negative_strand: true,
2941        };
2942
2943        let duplex = caller
2944            .build_duplex_consensus_from_padded(&ss_a, &ss_b)
2945            .expect("Should build duplex consensus");
2946
2947        // Position 0: Both have A, should be A
2948        assert_eq!(duplex.bases[0], b'A', "Position 0: Both have A, should agree");
2949
2950        // Position 1: A has N (uppercase), B has T -> should mask to N
2951        assert_eq!(
2952            duplex.bases[1], b'N',
2953            "Position 1: Strand A has uppercase N (NoCall), should mask to N"
2954        );
2955
2956        // Position 2: A has A, B has N (uppercase) -> should mask to N
2957        assert_eq!(
2958            duplex.bases[2], b'N',
2959            "Position 2: Strand B has uppercase N (NoCall), should mask to N"
2960        );
2961
2962        // Position 3: A has n (lowercase padding), B has G -> should NOT mask, use G
2963        assert_eq!(
2964            duplex.bases[3], b'G',
2965            "Position 3: Strand A has lowercase n (padding), should use strand B's base G"
2966        );
2967
2968        // Position 4: A has C, B has n (lowercase padding) -> should NOT mask, use C
2969        assert_eq!(
2970            duplex.bases[4], b'C',
2971            "Position 4: Strand B has lowercase n (padding), should use strand A's base C"
2972        );
2973    }
2974
2975    #[test]
2976    fn test_to_source_read_for_codec_positive_strand() {
2977        use noodles::sam::alignment::record::cigar::op::Kind;
2978
2979        // Create a forward strand read
2980        let read = create_test_paired_read(
2981            "read1",
2982            b"ACGT",
2983            &[30, 31, 32, 33],
2984            true,
2985            false, // forward strand
2986            true,
2987            100,
2988            &[(Kind::Match, 4)],
2989        );
2990
2991        let source = CodecConsensusCaller::to_source_read_for_codec(&read, 0);
2992
2993        // Forward strand: no revcomp, no reversal
2994        assert_eq!(source.bases, b"ACGT");
2995        assert_eq!(source.quals, vec![30, 31, 32, 33]);
2996        assert_eq!(source.original_idx, 0);
2997    }
2998
2999    #[test]
3000    fn test_to_source_read_for_codec_negative_strand() {
3001        use noodles::sam::alignment::record::cigar::op::Kind;
3002
3003        // Create a reverse strand read
3004        let read = create_test_paired_read(
3005            "read1",
3006            b"ACGT",
3007            &[30, 31, 32, 33],
3008            true,
3009            true, // reverse strand
3010            false,
3011            100,
3012            &[(Kind::Match, 4)],
3013        );
3014
3015        let source = CodecConsensusCaller::to_source_read_for_codec(&read, 1);
3016
3017        // Reverse strand: revcomp bases, reverse quals
3018        assert_eq!(source.bases, b"ACGT"); // ACGT revcomp is ACGT (palindrome)
3019        assert_eq!(source.quals, vec![33, 32, 31, 30]); // reversed
3020        assert_eq!(source.original_idx, 1);
3021    }
3022
3023    #[test]
3024    fn test_to_source_read_for_codec_negative_strand_non_palindrome() {
3025        use noodles::sam::alignment::record::cigar::op::Kind;
3026
3027        // Create a reverse strand read with non-palindromic sequence
3028        let read = create_test_paired_read(
3029            "read1",
3030            b"AACC", // revcomp is GGTT
3031            &[10, 20, 30, 40],
3032            true,
3033            true, // reverse strand
3034            false,
3035            100,
3036            &[(Kind::Match, 4)],
3037        );
3038
3039        let source = CodecConsensusCaller::to_source_read_for_codec(&read, 2);
3040
3041        // Reverse strand: revcomp bases (AACC -> GGTT), reverse quals
3042        assert_eq!(source.bases, b"GGTT");
3043        assert_eq!(source.quals, vec![40, 30, 20, 10]); // reversed
3044        assert_eq!(source.original_idx, 2);
3045    }
3046
3047    #[test]
3048    fn test_to_source_read_clip_exceeds_sequence_length() {
3049        use noodles::sam::alignment::record::cigar::op::Kind;
3050
3051        // Create a 4bp read and clip more than 4bp - should not panic
3052        let read = create_test_paired_read(
3053            "read1",
3054            b"ACGT",
3055            &[30, 31, 32, 33],
3056            true,
3057            false, // forward strand
3058            true,
3059            100,
3060            &[(Kind::Match, 4)],
3061        );
3062
3063        let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3064
3065        // Clip 10 bases from a 4bp read (clip_from_start = false)
3066        let source = CodecConsensusCaller::to_source_read_for_codec_raw(&raw, 0, 10, false, None);
3067        assert!(source.bases.is_empty(), "Bases should be empty after over-clipping");
3068        assert!(source.quals.is_empty(), "Quals should be empty after over-clipping");
3069
3070        // Clip 10 bases from start of a 4bp read (clip_from_start = true)
3071        let source = CodecConsensusCaller::to_source_read_for_codec_raw(&raw, 0, 10, true, None);
3072        assert!(source.bases.is_empty(), "Bases should be empty after over-clipping from start");
3073    }
3074
3075    #[test]
3076    fn test_to_source_read_clip_exact_sequence_length() {
3077        use noodles::sam::alignment::record::cigar::op::Kind;
3078
3079        // Clip exactly the sequence length - should produce empty but not panic
3080        let read = create_test_paired_read(
3081            "read1",
3082            b"ACGT",
3083            &[30, 31, 32, 33],
3084            true,
3085            false,
3086            true,
3087            100,
3088            &[(Kind::Match, 4)],
3089        );
3090
3091        let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3092        let source = CodecConsensusCaller::to_source_read_for_codec_raw(&raw, 0, 4, false, None);
3093        assert!(source.bases.is_empty());
3094        assert!(source.quals.is_empty());
3095    }
3096
3097    // ========================================================================
3098    // build_clipped_info tests
3099    // ========================================================================
3100
3101    #[test]
3102    fn test_build_clipped_info_no_clip_forward() {
3103        use noodles::sam::alignment::record::cigar::op::Kind;
3104
3105        let read = create_test_paired_read(
3106            "read1",
3107            b"ACGTACGT",
3108            &[30; 8],
3109            true,
3110            false, // forward strand
3111            true,
3112            100, // 1-based pos -> 0-based = 99
3113            &[(Kind::Match, 8)],
3114        );
3115        let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3116        let info = CodecConsensusCaller::build_clipped_info(&raw, 0, 0);
3117
3118        assert_eq!(info.raw_idx, 0);
3119        assert_eq!(info.clip_amount, 0);
3120        assert!(!info.clip_from_start, "Forward strand should not clip from start");
3121        assert_eq!(info.clipped_seq_len, 8);
3122        assert_eq!(info.adjusted_pos, 100); // 1-based
3123    }
3124
3125    #[test]
3126    fn test_build_clipped_info_clip_from_end_forward() {
3127        use noodles::sam::alignment::record::cigar::op::Kind;
3128
3129        let read = create_test_paired_read(
3130            "read1",
3131            b"ACGTACGT",
3132            &[30; 8],
3133            true,
3134            false, // forward strand
3135            true,
3136            100,
3137            &[(Kind::Match, 8)],
3138        );
3139        let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3140        let info = CodecConsensusCaller::build_clipped_info(&raw, 5, 3);
3141
3142        assert_eq!(info.raw_idx, 5);
3143        assert_eq!(info.clip_amount, 3);
3144        assert!(!info.clip_from_start, "Forward strand clips from end");
3145        assert_eq!(info.clipped_seq_len, 5); // 8 - 3
3146        assert_eq!(info.adjusted_pos, 100); // Position unchanged for end clipping
3147    }
3148
3149    #[test]
3150    fn test_build_clipped_info_clip_from_start_reverse() {
3151        use noodles::sam::alignment::record::cigar::op::Kind;
3152
3153        let read = create_test_paired_read(
3154            "read1",
3155            b"ACGTACGT",
3156            &[30; 8],
3157            true,
3158            true, // reverse strand
3159            false,
3160            100,
3161            &[(Kind::Match, 8)],
3162        );
3163        let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3164        let info = CodecConsensusCaller::build_clipped_info(&raw, 2, 3);
3165
3166        assert_eq!(info.raw_idx, 2);
3167        assert_eq!(info.clip_amount, 3);
3168        assert!(info.clip_from_start, "Reverse strand should clip from start");
3169        assert_eq!(info.clipped_seq_len, 5); // 8 - 3
3170        // Position adjusts forward by clipped reference bases
3171        assert!(info.adjusted_pos > 100);
3172    }
3173
3174    #[test]
3175    fn test_build_clipped_info_zero_clip_preserves_all() {
3176        use noodles::sam::alignment::record::cigar::op::Kind;
3177
3178        let read = create_test_paired_read(
3179            "read1",
3180            b"ACGT",
3181            &[30; 4],
3182            true,
3183            false,
3184            true,
3185            50,
3186            &[(Kind::Match, 4)],
3187        );
3188        let raw = CodecConsensusCaller::record_buf_to_raw(&read);
3189        let info = CodecConsensusCaller::build_clipped_info(&raw, 0, 0);
3190
3191        assert_eq!(info.clipped_seq_len, 4);
3192        assert_eq!(info.adjusted_pos, 50);
3193        assert_eq!(info.clipped_cigar.len(), 1); // Single 4M op
3194    }
3195
3196    #[test]
3197    fn test_vanilla_to_single_strand_conversion() {
3198        let vcr = VanillaConsensusRead {
3199            id: "test".to_string(),
3200            bases: vec![b'A', b'C', b'G', b'T'],
3201            quals: vec![30, 31, 32, 33],
3202            depths: vec![5, 6, 7, 8],
3203            errors: vec![0, 1, 0, 1],
3204            source_reads: None,
3205        };
3206
3207        let ss = CodecConsensusCaller::vanilla_to_single_strand(vcr.clone(), false, 10);
3208
3209        assert_eq!(ss.bases, vcr.bases);
3210        assert_eq!(ss.quals, vcr.quals);
3211        assert_eq!(ss.depths, vcr.depths);
3212        assert_eq!(ss.errors, vcr.errors);
3213        assert_eq!(ss.raw_read_count, 10);
3214        assert!(!ss.is_negative_strand);
3215        assert_eq!(ss.ref_start, 0);
3216        assert_eq!(ss.ref_end, 3); // len - 1
3217    }
3218
3219    #[test]
3220    fn test_vanilla_to_single_strand_negative_strand() {
3221        let vcr = VanillaConsensusRead {
3222            id: "test".to_string(),
3223            bases: vec![b'A', b'C'],
3224            quals: vec![30, 31],
3225            depths: vec![5, 6],
3226            errors: vec![0, 1],
3227            source_reads: None,
3228        };
3229
3230        let ss = CodecConsensusCaller::vanilla_to_single_strand(vcr, true, 5);
3231
3232        assert!(ss.is_negative_strand);
3233        assert_eq!(ss.raw_read_count, 5);
3234    }
3235
3236    #[test]
3237    fn test_codec_statistics_tracking() {
3238        // Test that CodecConsensusCaller properly tracks statistics
3239        let options = CodecConsensusOptions {
3240            min_reads_per_strand: 1,
3241            min_duplex_length: 1,
3242            ..Default::default()
3243        };
3244        let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3245
3246        let stats = caller.statistics();
3247
3248        // Initial statistics should be zero
3249        assert_eq!(stats.total_input_reads, 0);
3250        assert_eq!(stats.consensus_reads_generated, 0);
3251        assert_eq!(stats.reads_filtered, 0);
3252    }
3253
3254    // ==========================================================================
3255    // Tests for CIGAR-based utility functions
3256    // ==========================================================================
3257
3258    #[test]
3259    fn test_read_pos_at_ref_pos_simple_match() {
3260        use noodles::sam::alignment::record::cigar::op::Kind;
3261
3262        // Read at position 100 with 20M
3263        let read = create_test_paired_read(
3264            "read1",
3265            &[b'A'; 20],
3266            &[30; 20],
3267            true,
3268            false,
3269            true,
3270            100,
3271            &[(Kind::Match, 20)],
3272        );
3273
3274        // Query position at ref_pos 100 (start) should be 1 (1-based)
3275        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 100, false), Some(1));
3276
3277        // Query position at ref_pos 105 should be 6 (1-based)
3278        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 105, false), Some(6));
3279
3280        // Query position at ref_pos 119 (end) should be 20 (1-based)
3281        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 119, false), Some(20));
3282
3283        // Position before alignment should return None
3284        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 99, false), None);
3285
3286        // Position after alignment should return None
3287        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 120, false), None);
3288    }
3289
3290    #[test]
3291    fn test_read_pos_at_ref_pos_with_insertion() {
3292        use noodles::sam::alignment::record::cigar::op::Kind;
3293
3294        // Read at position 100 with 10M5I10M (25 query bases, 20 ref bases)
3295        let read = create_test_paired_read(
3296            "read1",
3297            &[b'A'; 25],
3298            &[30; 25],
3299            true,
3300            false,
3301            true,
3302            100,
3303            &[(Kind::Match, 10), (Kind::Insertion, 5), (Kind::Match, 10)],
3304        );
3305
3306        // Position in first match block
3307        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 100, false), Some(1));
3308        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 109, false), Some(10));
3309
3310        // Position in second match block (after insertion)
3311        // ref_pos 110 maps to query position 16 (10 from first M + 5 from I + 1)
3312        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 110, false), Some(16));
3313        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 119, false), Some(25));
3314    }
3315
3316    #[test]
3317    fn test_read_pos_at_ref_pos_with_deletion() {
3318        use noodles::sam::alignment::record::cigar::op::Kind;
3319
3320        // Read at position 100 with 10M5D10M (20 query bases, 25 ref bases)
3321        let read = create_test_paired_read(
3322            "read1",
3323            &[b'A'; 20],
3324            &[30; 20],
3325            true,
3326            false,
3327            true,
3328            100,
3329            &[(Kind::Match, 10), (Kind::Deletion, 5), (Kind::Match, 10)],
3330        );
3331
3332        // Position in first match block
3333        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 100, false), Some(1));
3334        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 109, false), Some(10));
3335
3336        // Position in deletion - should return None with return_last_base_if_deleted=false
3337        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 110, false), None);
3338        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 114, false), None);
3339
3340        // Position in deletion with return_last_base_if_deleted=true - should return last base before deletion
3341        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 110, true), Some(10));
3342        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 114, true), Some(10));
3343
3344        // Position in second match block (after deletion)
3345        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 115, false), Some(11));
3346        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 124, false), Some(20));
3347    }
3348
3349    #[test]
3350    fn test_read_pos_at_ref_pos_with_soft_clip() {
3351        use noodles::sam::alignment::record::cigar::op::Kind;
3352
3353        // Read at position 100 with 5S15M (20 query bases, 15 ref bases)
3354        // Soft clips consume query but not reference
3355        let read = create_test_paired_read(
3356            "read1",
3357            &[b'A'; 20],
3358            &[30; 20],
3359            true,
3360            false,
3361            true,
3362            100,
3363            &[(Kind::SoftClip, 5), (Kind::Match, 15)],
3364        );
3365
3366        // ref_pos 100 should map to query position 6 (after 5 soft-clipped bases)
3367        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 100, false), Some(6));
3368        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&read, 114, false), Some(20));
3369    }
3370
3371    #[test]
3372    fn test_check_overlap_phase_compatible() {
3373        use noodles::sam::alignment::record::cigar::op::Kind;
3374
3375        let options = CodecConsensusOptions::default();
3376        let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3377
3378        // Two reads with compatible CIGARs (both 30M)
3379        let r1 = create_test_paired_read(
3380            "read1",
3381            &[b'A'; 30],
3382            &[30; 30],
3383            true,
3384            false,
3385            true,
3386            100,
3387            &[(Kind::Match, 30)],
3388        );
3389        let r2 = create_test_paired_read(
3390            "read2",
3391            &[b'A'; 30],
3392            &[30; 30],
3393            false,
3394            true,
3395            false,
3396            110,
3397            &[(Kind::Match, 30)],
3398        );
3399
3400        // Overlap region: 110-129 (R2 start to R1 end)
3401        assert!(caller.check_overlap_phase(&r1, &r2, 110, 129));
3402    }
3403
3404    #[test]
3405    fn test_check_overlap_phase_indel_mismatch() {
3406        use noodles::sam::alignment::record::cigar::op::Kind;
3407
3408        let options = CodecConsensusOptions::default();
3409        let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3410
3411        // R1 with 30M, R2 with 15M5D15M
3412        // The overlap boundaries may land in the deletion
3413        let r1 = create_test_paired_read(
3414            "read1",
3415            &[b'A'; 30],
3416            &[30; 30],
3417            true,
3418            false,
3419            true,
3420            100,
3421            &[(Kind::Match, 30)],
3422        );
3423        let r2 = create_test_paired_read(
3424            "read2",
3425            &[b'A'; 30],
3426            &[30; 30],
3427            false,
3428            true,
3429            false,
3430            100,
3431            &[(Kind::Match, 15), (Kind::Deletion, 5), (Kind::Match, 15)],
3432        );
3433
3434        // Overlap region spanning the deletion should detect the phase mismatch
3435        // Result depends on whether the boundaries land in indels - just verify it runs
3436        let _result = caller.check_overlap_phase(&r1, &r2, 100, 129);
3437    }
3438
3439    #[test]
3440    fn test_pad_consensus_right() {
3441        // Test padding on the right (positive strand)
3442        let ss = SingleStrandConsensus {
3443            bases: b"ACGT".to_vec(),
3444            quals: vec![30, 31, 32, 33],
3445            depths: vec![5, 6, 7, 8],
3446            errors: vec![0, 1, 0, 1],
3447            raw_read_count: 5,
3448            ref_start: 100,
3449            ref_end: 103,
3450            is_negative_strand: false,
3451        };
3452
3453        let padded = CodecConsensusCaller::pad_consensus(&ss, 8, false);
3454
3455        assert_eq!(padded.bases.len(), 8);
3456        assert_eq!(&padded.bases[0..4], b"ACGT");
3457        assert_eq!(&padded.bases[4..8], b"nnnn"); // lowercase n for padding
3458        assert_eq!(padded.quals[4..8], vec![0, 0, 0, 0]);
3459        assert_eq!(padded.depths[4..8], vec![0, 0, 0, 0]);
3460    }
3461
3462    #[test]
3463    fn test_pad_consensus_left() {
3464        // Test padding on the left (negative strand)
3465        let ss = SingleStrandConsensus {
3466            bases: b"ACGT".to_vec(),
3467            quals: vec![30, 31, 32, 33],
3468            depths: vec![5, 6, 7, 8],
3469            errors: vec![0, 1, 0, 1],
3470            raw_read_count: 5,
3471            ref_start: 100,
3472            ref_end: 103,
3473            is_negative_strand: true,
3474        };
3475
3476        let padded = CodecConsensusCaller::pad_consensus(&ss, 8, true);
3477
3478        assert_eq!(padded.bases.len(), 8);
3479        assert_eq!(&padded.bases[0..4], b"nnnn"); // lowercase n for padding
3480        assert_eq!(&padded.bases[4..8], b"ACGT");
3481        assert_eq!(padded.quals[0..4], vec![0, 0, 0, 0]);
3482        assert_eq!(padded.depths[0..4], vec![0, 0, 0, 0]);
3483    }
3484
3485    #[test]
3486    fn test_pad_consensus_no_padding_needed() {
3487        // Test when consensus is already the target length
3488        let ss = SingleStrandConsensus {
3489            bases: b"ACGT".to_vec(),
3490            quals: vec![30, 31, 32, 33],
3491            depths: vec![5, 6, 7, 8],
3492            errors: vec![0, 1, 0, 1],
3493            raw_read_count: 5,
3494            ref_start: 100,
3495            ref_end: 103,
3496            is_negative_strand: false,
3497        };
3498
3499        let padded = CodecConsensusCaller::pad_consensus(&ss, 4, false);
3500
3501        // Should return unchanged
3502        assert_eq!(padded.bases, ss.bases);
3503        assert_eq!(padded.quals, ss.quals);
3504    }
3505
3506    #[test]
3507    fn test_pad_consensus_shorter_target() {
3508        // Test when target length is shorter than current (should return unchanged)
3509        let ss = SingleStrandConsensus {
3510            bases: b"ACGT".to_vec(),
3511            quals: vec![30, 31, 32, 33],
3512            depths: vec![5, 6, 7, 8],
3513            errors: vec![0, 1, 0, 1],
3514            raw_read_count: 5,
3515            ref_start: 100,
3516            ref_end: 103,
3517            is_negative_strand: false,
3518        };
3519
3520        let padded = CodecConsensusCaller::pad_consensus(&ss, 2, false);
3521
3522        // Should return unchanged since new_length <= current_len
3523        assert_eq!(padded.bases.len(), 4);
3524    }
3525
3526    #[test]
3527    fn test_mask_consensus_quals_query_based_single_strand() {
3528        // Test query-based masking where padding indicates single-strand
3529        let options = CodecConsensusOptions { single_strand_qual: Some(10), ..Default::default() };
3530        let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3531
3532        let consensus = SingleStrandConsensus {
3533            bases: b"ACGT".to_vec(),
3534            quals: vec![30, 30, 30, 30],
3535            depths: vec![5; 4],
3536            errors: vec![0; 4],
3537            raw_read_count: 5,
3538            ref_start: 0,
3539            ref_end: 3,
3540            is_negative_strand: false,
3541        };
3542
3543        // R1 has N at position 0 (padding)
3544        let padded_r1 = SingleStrandConsensus {
3545            bases: b"NCGT".to_vec(),
3546            quals: vec![0, 30, 30, 30],
3547            depths: vec![0, 5, 5, 5],
3548            errors: vec![0; 4],
3549            raw_read_count: 5,
3550            ref_start: 0,
3551            ref_end: 3,
3552            is_negative_strand: false,
3553        };
3554
3555        // R2 has N at position 3 (padding)
3556        let padded_r2 = SingleStrandConsensus {
3557            bases: b"ACGN".to_vec(),
3558            quals: vec![30, 30, 30, 0],
3559            depths: vec![5, 5, 5, 0],
3560            errors: vec![0; 4],
3561            raw_read_count: 5,
3562            ref_start: 0,
3563            ref_end: 3,
3564            is_negative_strand: true,
3565        };
3566
3567        let masked = caller.mask_consensus_quals_query_based(consensus, &padded_r1, &padded_r2);
3568
3569        // Position 0: R1 has N, so single-strand -> qual=10
3570        assert_eq!(masked.quals[0], 10);
3571        // Position 3: R2 has N, so single-strand -> qual=10
3572        assert_eq!(masked.quals[3], 10);
3573        // Positions 1-2: both have data, duplex -> keep original
3574        assert_eq!(masked.quals[1], 30);
3575        assert_eq!(masked.quals[2], 30);
3576    }
3577
3578    #[test]
3579    fn test_is_cigar_prefix_exact_match() {
3580        use noodles::sam::alignment::record::cigar::op::Kind;
3581
3582        // Same CIGAR
3583        let a = vec![(Kind::Match, 50)];
3584        let b = vec![(Kind::Match, 50)];
3585        assert!(cigar_utils::is_cigar_prefix(&a, &b));
3586    }
3587
3588    #[test]
3589    fn test_is_cigar_prefix_shorter_last_element() {
3590        use noodles::sam::alignment::record::cigar::op::Kind;
3591
3592        // a is shorter in last element
3593        let a = vec![(Kind::Match, 40)];
3594        let b = vec![(Kind::Match, 50)];
3595        assert!(cigar_utils::is_cigar_prefix(&a, &b));
3596    }
3597
3598    #[test]
3599    fn test_is_cigar_prefix_longer_not_prefix() {
3600        use noodles::sam::alignment::record::cigar::op::Kind;
3601
3602        // a is longer than b - not a prefix
3603        let a = vec![(Kind::Match, 50)];
3604        let b = vec![(Kind::Match, 40)];
3605        assert!(!cigar_utils::is_cigar_prefix(&a, &b));
3606    }
3607
3608    #[test]
3609    fn test_is_cigar_prefix_different_ops() {
3610        use noodles::sam::alignment::record::cigar::op::Kind;
3611
3612        // Different operations
3613        let a = vec![(Kind::Match, 50)];
3614        let b = vec![(Kind::Insertion, 50)];
3615        assert!(!cigar_utils::is_cigar_prefix(&a, &b));
3616    }
3617
3618    #[test]
3619    fn test_is_cigar_prefix_complex() {
3620        use noodles::sam::alignment::record::cigar::op::Kind;
3621
3622        // Complex CIGAR with matching indels
3623        let a = vec![(Kind::Match, 25), (Kind::Deletion, 5), (Kind::Match, 20)];
3624        let b = vec![(Kind::Match, 25), (Kind::Deletion, 5), (Kind::Match, 30)];
3625        assert!(cigar_utils::is_cigar_prefix(&a, &b));
3626
3627        // Different deletion length - not a prefix
3628        let c = vec![(Kind::Match, 25), (Kind::Deletion, 3), (Kind::Match, 20)];
3629        assert!(!cigar_utils::is_cigar_prefix(&c, &b));
3630    }
3631
3632    #[test]
3633    fn test_reverse_complement_ss_depths_errors_reversed() {
3634        // Verify that depths and errors are reversed along with bases/quals for CODEC
3635        let ss = SingleStrandConsensus {
3636            bases: b"ACGT".to_vec(),
3637            quals: vec![10, 20, 30, 40],
3638            depths: vec![1, 2, 3, 4],
3639            errors: vec![5, 6, 7, 8],
3640            raw_read_count: 5,
3641            ref_start: 100,
3642            ref_end: 103,
3643            is_negative_strand: false,
3644        };
3645
3646        let rc = CodecConsensusCaller::reverse_complement_ss(&ss);
3647
3648        // Bases should be reverse complemented
3649        assert_eq!(rc.bases, b"ACGT"); // ACGT revcomp is ACGT (palindrome)
3650
3651        // Quals should be reversed
3652        assert_eq!(rc.quals, vec![40, 30, 20, 10]);
3653
3654        // For CODEC query-based consensus, depths and errors ARE reversed
3655        assert_eq!(rc.depths, vec![4, 3, 2, 1]);
3656        assert_eq!(rc.errors, vec![8, 7, 6, 5]);
3657
3658        // Strand flag should be flipped
3659        assert!(rc.is_negative_strand);
3660    }
3661
3662    #[test]
3663    fn test_downsample_pairs() {
3664        use noodles::sam::alignment::record::cigar::op::Kind;
3665
3666        let options = CodecConsensusOptions { max_reads_per_strand: Some(2), ..Default::default() };
3667        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3668
3669        // Create 5 R1s and 5 R2s
3670        let r1s: Vec<RecordBuf> = (0..5)
3671            .map(|i| {
3672                create_test_paired_read(
3673                    &format!("r{i}"),
3674                    &[b'A'; 20],
3675                    &[30; 20],
3676                    true,
3677                    false,
3678                    true,
3679                    100,
3680                    &[(Kind::Match, 20)],
3681                )
3682            })
3683            .collect();
3684        let r2s: Vec<RecordBuf> = (0..5)
3685            .map(|i| {
3686                create_test_paired_read(
3687                    &format!("r{i}"),
3688                    &[b'A'; 20],
3689                    &[30; 20],
3690                    false,
3691                    true,
3692                    false,
3693                    110,
3694                    &[(Kind::Match, 20)],
3695                )
3696            })
3697            .collect();
3698
3699        let (ds_r1s, ds_r2s) = caller.downsample_pairs(r1s, r2s);
3700
3701        assert_eq!(ds_r1s.len(), 2);
3702        assert_eq!(ds_r2s.len(), 2);
3703    }
3704
3705    #[test]
3706    fn test_downsample_pairs_no_limit() {
3707        use noodles::sam::alignment::record::cigar::op::Kind;
3708
3709        let options = CodecConsensusOptions { max_reads_per_strand: None, ..Default::default() };
3710        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3711
3712        let r1s: Vec<RecordBuf> = (0..5)
3713            .map(|i| {
3714                create_test_paired_read(
3715                    &format!("r{i}"),
3716                    &[b'A'; 20],
3717                    &[30; 20],
3718                    true,
3719                    false,
3720                    true,
3721                    100,
3722                    &[(Kind::Match, 20)],
3723                )
3724            })
3725            .collect();
3726        let r2s: Vec<RecordBuf> = (0..5)
3727            .map(|i| {
3728                create_test_paired_read(
3729                    &format!("r{i}"),
3730                    &[b'A'; 20],
3731                    &[30; 20],
3732                    false,
3733                    true,
3734                    false,
3735                    110,
3736                    &[(Kind::Match, 20)],
3737                )
3738            })
3739            .collect();
3740
3741        let (ds_r1s, ds_r2s) = caller.downsample_pairs(r1s, r2s);
3742
3743        // No limit, so all reads should be kept
3744        assert_eq!(ds_r1s.len(), 5);
3745        assert_eq!(ds_r2s.len(), 5);
3746    }
3747
3748    #[test]
3749    fn test_compute_codec_consensus_length() {
3750        use noodles::sam::alignment::record::cigar::op::Kind;
3751
3752        // R1 (positive strand) at pos 100 with 30M
3753        // R2 (negative strand) at pos 110 with 30M
3754        // Overlap end is at position 129 (R1 end)
3755        let pos_read = create_test_paired_read(
3756            "pos",
3757            &[b'A'; 30],
3758            &[30; 30],
3759            true,
3760            false,
3761            true,
3762            100,
3763            &[(Kind::Match, 30)],
3764        );
3765        let neg_read = create_test_paired_read(
3766            "neg",
3767            &[b'A'; 30],
3768            &[30; 30],
3769            false,
3770            true,
3771            false,
3772            110,
3773            &[(Kind::Match, 30)],
3774        );
3775
3776        // Overlap end at 129 (R1's end position)
3777        let length =
3778            CodecConsensusCaller::compute_codec_consensus_length(&pos_read, &neg_read, 129);
3779
3780        // pos_read_pos at 129 = 30 (last base)
3781        // neg_read_pos at 129 = 20 (position 129-110+1 = 20)
3782        // neg_length = 30
3783        // consensus_length = 30 + 30 - 20 = 40
3784        assert_eq!(length, Some(40));
3785    }
3786
3787    #[test]
3788    fn test_compute_codec_consensus_length_in_deletion() {
3789        use noodles::sam::alignment::record::cigar::op::Kind;
3790
3791        // Positive read with 15M5D15M
3792        let pos_read = create_test_paired_read(
3793            "pos",
3794            &[b'A'; 30],
3795            &[30; 30],
3796            true,
3797            false,
3798            true,
3799            100,
3800            &[(Kind::Match, 15), (Kind::Deletion, 5), (Kind::Match, 15)],
3801        );
3802        let neg_read = create_test_paired_read(
3803            "neg",
3804            &[b'A'; 30],
3805            &[30; 30],
3806            false,
3807            true,
3808            false,
3809            100,
3810            &[(Kind::Match, 30)],
3811        );
3812
3813        // Overlap end at position 116 (inside the deletion of pos_read at 115-119)
3814        let length =
3815            CodecConsensusCaller::compute_codec_consensus_length(&pos_read, &neg_read, 116);
3816
3817        // Should return None because position 116 is in a deletion
3818        assert!(length.is_none());
3819    }
3820
3821    #[test]
3822    fn test_reject_records_tracking() {
3823        use noodles::sam::alignment::record::cigar::op::Kind;
3824
3825        let options = CodecConsensusOptions {
3826            min_reads_per_strand: 2, // Require 2 reads per strand
3827            ..Default::default()
3828        };
3829        let mut caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3830
3831        // Create single pair - should be rejected for insufficient reads
3832        let reads = create_fr_pair(
3833            "read1",
3834            100,
3835            110,
3836            30,
3837            35,
3838            &[(Kind::Match, 30)],
3839            &[(Kind::Match, 30)],
3840            "UMI1",
3841            None,
3842            false,
3843            true,
3844        );
3845
3846        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
3847
3848        assert_eq!(output.count, 0);
3849        assert!(
3850            caller.stats.rejection_reasons.contains_key(&CallerRejectionReason::InsufficientReads)
3851        );
3852    }
3853
3854    #[test]
3855    fn test_rejected_reads_tracking_enabled() {
3856        use noodles::sam::alignment::record::cigar::op::Kind;
3857
3858        let options = CodecConsensusOptions {
3859            min_reads_per_strand: 2, // Require 2 reads per strand
3860            ..Default::default()
3861        };
3862        let mut caller = CodecConsensusCaller::new_with_rejects_tracking(
3863            "codec".to_string(),
3864            "RG1".to_string(),
3865            options,
3866            true, // Enable tracking
3867        );
3868
3869        let reads = create_fr_pair(
3870            "read1",
3871            100,
3872            110,
3873            30,
3874            35,
3875            &[(Kind::Match, 30)],
3876            &[(Kind::Match, 30)],
3877            "UMI1",
3878            None,
3879            false,
3880            true,
3881        );
3882
3883        let output = caller.consensus_reads_from_sam_records(reads).unwrap();
3884
3885        assert_eq!(output.count, 0);
3886        // Rejected reads should be tracked
3887        assert!(!caller.rejected_reads().is_empty() || caller.statistics().reads_filtered > 0);
3888    }
3889
3890    #[test]
3891    fn test_clear_rejected_reads() {
3892        let options = CodecConsensusOptions::default();
3893        let mut caller = CodecConsensusCaller::new_with_rejects_tracking(
3894            "codec".to_string(),
3895            "RG1".to_string(),
3896            options,
3897            true,
3898        );
3899
3900        // Initially empty
3901        assert!(caller.rejected_reads().is_empty());
3902
3903        // Clear (should be no-op on empty)
3904        caller.clear_rejected_reads();
3905        assert!(caller.rejected_reads().is_empty());
3906    }
3907
3908    // ============================================================================
3909    // Tests for additional HIGH risk code paths
3910    // ============================================================================
3911
3912    #[test]
3913    fn test_read_pos_at_ref_pos_with_hard_clip() {
3914        // Create a read with 2H5M2H CIGAR at position 100 (hard clips don't appear in seq)
3915        let record = RecordBuilder::new()
3916            .sequence("ACGTA") // 5 bases for 5M
3917            .alignment_start(100)
3918            .cigar("2H5M2H")
3919            .build();
3920
3921        // Position 100 maps to query pos 1 (hard clips don't count)
3922        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&record, 100, false), Some(1));
3923        // Position 104 maps to query pos 5
3924        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&record, 104, false), Some(5));
3925    }
3926
3927    #[test]
3928    fn test_read_pos_at_ref_pos_deletion_at_start() {
3929        // Create a read with 2D5M at position 100
3930        // Ref:   100 101 102 103 104 105 106
3931        // Query:   -   -   1   2   3   4   5
3932        let record = RecordBuilder::new()
3933            .sequence("ACGTA") // 5 bases for 5M
3934            .alignment_start(100)
3935            .cigar("2D5M")
3936            .build();
3937
3938        // Position 100 is in deletion at start
3939        // return_last_base_if_deleted=true with query_offset=0 should return 1
3940        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&record, 100, true), Some(1));
3941        assert_eq!(CodecConsensusCaller::read_pos_at_ref_pos(&record, 100, false), None);
3942    }
3943
3944    #[test]
3945    fn test_check_overlap_phase_matching_cigars() {
3946        let options = CodecConsensusOptions::default();
3947        let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3948
3949        // Create two reads that overlap perfectly
3950        let r1 = RecordBuilder::new()
3951            .sequence(&"A".repeat(50))
3952            .alignment_start(100)
3953            .cigar("50M")
3954            .build();
3955
3956        let r2 = RecordBuilder::new()
3957            .sequence(&"A".repeat(50))
3958            .alignment_start(120)
3959            .cigar("50M")
3960            .build();
3961
3962        // Overlap region: 120-149
3963        assert!(caller.check_overlap_phase(&r1, &r2, 120, 149));
3964    }
3965
3966    #[test]
3967    fn test_check_overlap_phase_deletion_mismatch() {
3968        let options = CodecConsensusOptions::default();
3969        let caller = CodecConsensusCaller::new("codec".to_string(), "RG1".to_string(), options);
3970
3971        // Create two reads where one has a deletion
3972        let r1 = RecordBuilder::new()
3973            .sequence(&"A".repeat(50))
3974            .alignment_start(100)
3975            .cigar("50M")
3976            .build();
3977
3978        // r2 has a 2bp deletion in the overlap region
3979        let r2 = RecordBuilder::new()
3980            .sequence(&"A".repeat(48)) // 15M + 33M = 48 bases (deletion doesn't consume query)
3981            .alignment_start(120)
3982            .cigar("15M2D33M")
3983            .build();
3984
3985        // Overlap region spans the deletion - phases should differ
3986        assert!(!caller.check_overlap_phase(&r1, &r2, 120, 149));
3987    }
3988}