Skip to main content

fgumi_consensus/
filter.rs

1//! Consensus read filtering logic.
2//!
3//! This module provides functionality for filtering consensus reads based on quality, depth,
4//! and error rate thresholds. It supports both single-strand and duplex consensus reads.
5
6use ahash::AHashMap;
7use anyhow::Result;
8use noodles::sam::alignment::record::Sequence as SequenceTrait;
9use noodles::sam::alignment::record_buf::data::field::Value;
10use noodles::sam::alignment::record_buf::{QualityScores, RecordBuf, Sequence};
11
12use crate::phred::{MIN_PHRED, NO_CALL_BASE};
13use crate::tags::{per_base, per_read};
14use fgumi_metrics::rejection::RejectionReason;
15
16/// Filter thresholds for consensus reads
17#[derive(Debug, Clone)]
18pub struct FilterThresholds {
19    /// Minimum number of raw reads to support a consensus base/read
20    pub min_reads: usize,
21
22    /// Maximum raw read error rate (0.0-1.0)
23    pub max_read_error_rate: f64,
24
25    /// Maximum base error rate (0.0-1.0)
26    pub max_base_error_rate: f64,
27}
28
29/// Consensus read type
30#[derive(Debug, Clone, Copy, PartialEq, Eq)]
31pub enum ConsensusType {
32    /// Single-strand consensus (from `simplex`)
33    SingleStrand,
34
35    /// Duplex consensus (from `duplex`)
36    Duplex,
37}
38
39/// Filtering configuration for consensus reads
40#[derive(Debug, Clone)]
41pub struct FilterConfig {
42    /// Thresholds for duplex consensus reads (if applicable)
43    pub duplex_thresholds: Option<FilterThresholds>,
44
45    /// Thresholds for AB/top-strand consensus
46    pub ab_thresholds: Option<FilterThresholds>,
47
48    /// Thresholds for BA/bottom-strand consensus
49    pub ba_thresholds: Option<FilterThresholds>,
50
51    /// Thresholds for single-strand consensus
52    pub single_strand_thresholds: Option<FilterThresholds>,
53
54    /// Minimum base quality after masking (optional - None means no quality masking)
55    pub min_base_quality: Option<u8>,
56
57    /// Minimum mean base quality after masking (optional)
58    pub min_mean_base_quality: Option<f64>,
59
60    /// Maximum fraction of no-calls (N bases) allowed (0.0-1.0)
61    pub max_no_call_fraction: f64,
62}
63
64impl FilterConfig {
65    /// Creates a filter configuration for single-strand (simplex) consensus reads.
66    ///
67    /// This is the simplest configuration - one set of thresholds applied to all reads.
68    ///
69    /// # Arguments
70    /// * `thresholds` - Filter thresholds for single-strand consensus
71    /// * `min_base_quality` - Minimum base quality after masking (None means no quality masking)
72    /// * `min_mean_base_quality` - Optional minimum mean base quality
73    /// * `max_no_call_fraction` - Maximum fraction of N bases allowed
74    #[must_use]
75    pub fn for_single_strand(
76        thresholds: FilterThresholds,
77        min_base_quality: Option<u8>,
78        min_mean_base_quality: Option<f64>,
79        max_no_call_fraction: f64,
80    ) -> Self {
81        Self {
82            duplex_thresholds: Some(thresholds.clone()),
83            ab_thresholds: Some(thresholds.clone()),
84            ba_thresholds: Some(thresholds.clone()),
85            single_strand_thresholds: Some(thresholds),
86            min_base_quality,
87            min_mean_base_quality,
88            max_no_call_fraction,
89        }
90    }
91
92    /// Creates a filter configuration for symmetric duplex consensus reads.
93    ///
94    /// Uses the same thresholds for both AB and BA strands.
95    ///
96    /// # Arguments
97    /// * `duplex` - Filter thresholds for final duplex consensus
98    /// * `strand` - Filter thresholds for both AB and BA strands (symmetric)
99    /// * `min_base_quality` - Minimum base quality after masking (None means no quality masking)
100    /// * `min_mean_base_quality` - Optional minimum mean base quality
101    /// * `max_no_call_fraction` - Maximum fraction of N bases allowed
102    ///
103    /// # Panics
104    ///
105    /// Panics if thresholds violate ordering constraints (`strand.min_reads` <= `duplex.min_reads`, etc.)
106    #[must_use]
107    pub fn for_duplex(
108        duplex: FilterThresholds,
109        strand: FilterThresholds,
110        min_base_quality: Option<u8>,
111        min_mean_base_quality: Option<f64>,
112        max_no_call_fraction: f64,
113    ) -> Self {
114        Self::for_duplex_asymmetric(
115            duplex,
116            strand.clone(),
117            strand,
118            min_base_quality,
119            min_mean_base_quality,
120            max_no_call_fraction,
121        )
122    }
123
124    /// Creates a filter configuration for asymmetric duplex consensus reads.
125    ///
126    /// Uses different thresholds for AB (higher depth strand) and BA (lower depth strand).
127    ///
128    /// # Arguments
129    /// * `duplex` - Filter thresholds for final duplex consensus
130    /// * `ab` - Filter thresholds for AB strand (typically higher depth)
131    /// * `ba` - Filter thresholds for BA strand (typically lower depth)
132    /// * `min_base_quality` - Minimum base quality after masking (None means no quality masking)
133    /// * `min_mean_base_quality` - Optional minimum mean base quality
134    /// * `max_no_call_fraction` - Maximum fraction of N bases allowed
135    ///
136    /// # Panics
137    ///
138    /// Panics if thresholds violate ordering constraints:
139    /// - `min_reads`: BA <= AB <= duplex
140    /// - error rates: AB <= BA (AB more stringent)
141    #[must_use]
142    pub fn for_duplex_asymmetric(
143        duplex: FilterThresholds,
144        ab: FilterThresholds,
145        ba: FilterThresholds,
146        min_base_quality: Option<u8>,
147        min_mean_base_quality: Option<f64>,
148        max_no_call_fraction: f64,
149    ) -> Self {
150        // Validate threshold ordering (matching fgbio's validation)
151        assert!(
152            ab.min_reads <= duplex.min_reads,
153            "min-reads values must be specified high to low: AB ({}) > duplex ({})",
154            ab.min_reads,
155            duplex.min_reads
156        );
157        assert!(
158            ba.min_reads <= ab.min_reads,
159            "min-reads values must be specified high to low: BA ({}) > AB ({})",
160            ba.min_reads,
161            ab.min_reads
162        );
163        assert!(
164            ab.max_read_error_rate <= ba.max_read_error_rate,
165            "max-read-error-rate for AB ({}) must be <= BA ({})",
166            ab.max_read_error_rate,
167            ba.max_read_error_rate
168        );
169        assert!(
170            ab.max_base_error_rate <= ba.max_base_error_rate,
171            "max-base-error-rate for AB ({}) must be <= BA ({})",
172            ab.max_base_error_rate,
173            ba.max_base_error_rate
174        );
175
176        Self {
177            duplex_thresholds: Some(duplex.clone()),
178            ab_thresholds: Some(ab),
179            ba_thresholds: Some(ba),
180            single_strand_thresholds: Some(duplex),
181            min_base_quality,
182            min_mean_base_quality,
183            max_no_call_fraction,
184        }
185    }
186
187    /// Returns duplex (CC), AB, and BA thresholds, or `None` if any are missing.
188    #[must_use]
189    pub fn duplex_thresholds(
190        &self,
191    ) -> Option<(&FilterThresholds, &FilterThresholds, &FilterThresholds)> {
192        match (
193            self.duplex_thresholds.as_ref(),
194            self.ab_thresholds.as_ref(),
195            self.ba_thresholds.as_ref(),
196        ) {
197            (Some(cc), Some(ab), Some(ba)) => Some((cc, ab, ba)),
198            _ => None,
199        }
200    }
201
202    /// Returns the single-strand thresholds, falling back to duplex thresholds.
203    #[must_use]
204    pub fn effective_single_strand_thresholds(&self) -> Option<&FilterThresholds> {
205        self.single_strand_thresholds.as_ref().or(self.duplex_thresholds.as_ref())
206    }
207
208    /// Creates a new filter configuration from parameter vectors
209    ///
210    /// # Arguments
211    /// * `min_reads` - 1-3 values for [duplex, AB, BA] or [single-strand]
212    /// * `max_read_error_rate` - 1-3 values for [duplex, AB, BA] or [single-strand]
213    /// * `max_base_error_rate` - 1-3 values for [duplex, AB, BA] or [single-strand]
214    /// * `min_base_quality` - Minimum base quality after masking (None means no quality masking)
215    /// * `min_mean_base_quality` - Optional minimum mean base quality
216    /// * `max_no_call_fraction` - Maximum fraction of N bases allowed
217    ///
218    /// # Panics
219    ///
220    /// Panics if thresholds violate ordering constraints:
221    /// - `min_reads`: BA <= AB <= duplex
222    /// - error rates: AB <= BA (AB more stringent)
223    #[must_use]
224    pub fn new(
225        min_reads: &[usize],
226        max_read_error_rate: &[f64],
227        max_base_error_rate: &[f64],
228        min_base_quality: Option<u8>,
229        min_mean_base_quality: Option<f64>,
230        max_no_call_fraction: f64,
231    ) -> Self {
232        // Helper to get threshold at index with fallback
233        let get_threshold = |values: &[usize], index: usize| -> usize {
234            match values.len() {
235                2 => {
236                    if index == 0 {
237                        values[0]
238                    } else {
239                        values[1]
240                    }
241                }
242                3 => values[index],
243                _ => values[0],
244            }
245        };
246
247        let get_threshold_f64 = |values: &[f64], index: usize| -> f64 {
248            match values.len() {
249                2 => {
250                    if index == 0 {
251                        values[0]
252                    } else {
253                        values[1]
254                    }
255                }
256                3 => values[index],
257                _ => values[0],
258            }
259        };
260
261        // Create thresholds for all levels - matching fgbio which always creates all three
262        // when filtering either simplex or duplex reads. Single values are replicated to all levels.
263        let duplex_thresholds = Some(FilterThresholds {
264            min_reads: get_threshold(min_reads, 0),
265            max_read_error_rate: get_threshold_f64(max_read_error_rate, 0),
266            max_base_error_rate: get_threshold_f64(max_base_error_rate, 0),
267        });
268
269        let ab_thresholds = Some(FilterThresholds {
270            min_reads: get_threshold(min_reads, 1),
271            max_read_error_rate: get_threshold_f64(max_read_error_rate, 1),
272            max_base_error_rate: get_threshold_f64(max_base_error_rate, 1),
273        });
274
275        let ba_thresholds = Some(FilterThresholds {
276            min_reads: get_threshold(min_reads, 2),
277            max_read_error_rate: get_threshold_f64(max_read_error_rate, 2),
278            max_base_error_rate: get_threshold_f64(max_base_error_rate, 2),
279        });
280
281        // Also create single-strand thresholds using the first value
282        let single_strand_thresholds = Some(FilterThresholds {
283            min_reads: min_reads[0],
284            max_read_error_rate: if max_read_error_rate.is_empty() {
285                1.0
286            } else {
287                max_read_error_rate[0]
288            },
289            max_base_error_rate: if max_base_error_rate.is_empty() {
290                1.0
291            } else {
292                max_base_error_rate[0]
293            },
294        });
295
296        // Validate threshold ordering for duplex mode (matching fgbio's validation)
297        // For depth thresholds: BA <= AB <= CC (values must be specified high to low)
298        // For error rates: AB <= BA (AB must be more stringent than BA)
299        if let (Some(cc), Some(ab), Some(ba)) = (&duplex_thresholds, &ab_thresholds, &ba_thresholds)
300        {
301            // min_reads: CC >= AB >= BA
302            assert!(
303                ab.min_reads <= cc.min_reads,
304                "min-reads values must be specified high to low: AB ({}) > CC ({})",
305                ab.min_reads,
306                cc.min_reads
307            );
308            assert!(
309                ba.min_reads <= ab.min_reads,
310                "min-reads values must be specified high to low: BA ({}) > AB ({})",
311                ba.min_reads,
312                ab.min_reads
313            );
314
315            // max_read_error_rate: AB <= BA (AB more stringent)
316            assert!(
317                ab.max_read_error_rate <= ba.max_read_error_rate,
318                "max-read-error-rate for AB ({}) must be <= BA ({})",
319                ab.max_read_error_rate,
320                ba.max_read_error_rate
321            );
322
323            // max_base_error_rate: AB <= BA (AB more stringent)
324            assert!(
325                ab.max_base_error_rate <= ba.max_base_error_rate,
326                "max-base-error-rate for AB ({}) must be <= BA ({})",
327                ab.max_base_error_rate,
328                ba.max_base_error_rate
329            );
330        }
331
332        Self {
333            duplex_thresholds,
334            ab_thresholds,
335            ba_thresholds,
336            single_strand_thresholds,
337            min_base_quality,
338            min_mean_base_quality,
339            max_no_call_fraction,
340        }
341    }
342}
343
344/// Result of filtering a consensus read
345#[derive(Debug, Clone, Copy, PartialEq, Eq)]
346pub enum FilterResult {
347    /// Read passed all filters
348    Pass,
349
350    /// Read failed minimum reads threshold
351    InsufficientReads,
352
353    /// Read failed maximum error rate threshold
354    ExcessiveErrorRate,
355
356    /// Read failed minimum mean base quality threshold
357    LowQuality,
358
359    /// Read failed maximum no-call fraction threshold
360    TooManyNoCalls,
361}
362
363impl FilterResult {
364    /// Converts a `FilterResult` to a `RejectionReason`, if rejected.
365    ///
366    /// Returns `None` if the result is `Pass`, otherwise returns the corresponding
367    /// rejection reason for tracking in metrics and logging.
368    #[must_use]
369    pub fn to_rejection_reason(&self) -> Option<RejectionReason> {
370        match self {
371            Self::Pass => None,
372            Self::InsufficientReads => Some(RejectionReason::InsufficientSupport),
373            Self::ExcessiveErrorRate => Some(RejectionReason::ExcessiveErrorRate),
374            Self::LowQuality => Some(RejectionReason::LowMeanQuality),
375            Self::TooManyNoCalls => Some(RejectionReason::ExcessiveNBases),
376        }
377    }
378}
379
380/// Helper function to extract integer value from SAM tag, handling different integer types
381fn extract_int_tag(data: &noodles::sam::alignment::record_buf::Data, tag_str: &str) -> Option<i32> {
382    let tag = per_read::tag(tag_str);
383    match data.get(&tag)? {
384        Value::Int8(v) => Some(i32::from(*v)),
385        Value::UInt8(v) => Some(i32::from(*v)),
386        Value::Int16(v) => Some(i32::from(*v)),
387        Value::UInt16(v) => Some(i32::from(*v)),
388        Value::Int32(v) => Some(*v),
389        #[expect(
390            clippy::cast_possible_wrap,
391            reason = "BAM tag values in practice never exceed i32::MAX"
392        )]
393        Value::UInt32(v) => Some(*v as i32),
394        _ => None,
395    }
396}
397
398/// Helper function to extract float value from SAM tag
399fn extract_float_tag(
400    data: &noodles::sam::alignment::record_buf::Data,
401    tag_str: &str,
402) -> Option<f32> {
403    let tag = per_read::tag(tag_str);
404    match data.get(&tag)? {
405        Value::Float(v) => Some(*v),
406        _ => None,
407    }
408}
409
410/// Filters a consensus read based on per-read tags
411///
412/// For duplex reads, also checks AB and BA strand tags independently.
413///
414/// # Arguments
415/// * `record` - The consensus read to filter
416/// * `thresholds` - The filter thresholds to apply
417/// * `ab_thresholds` - Optional thresholds for AB strand (duplex only)
418/// * `ba_thresholds` - Optional thresholds for BA strand (duplex only)
419///
420/// # Returns
421/// `FilterResult` indicating whether the read passed or why it failed
422///
423/// # Errors
424///
425/// Returns an error if the record data cannot be read.
426pub fn filter_read(record: &RecordBuf, thresholds: &FilterThresholds) -> Result<FilterResult> {
427    // Check minimum reads (cD tag)
428    let depth_tag = per_read::tag("cD");
429    if let Some(Value::UInt8(depth)) = record.data().get(&depth_tag) {
430        if (*depth as usize) < thresholds.min_reads {
431            return Ok(FilterResult::InsufficientReads);
432        }
433    }
434
435    // Check maximum error rate (cE tag)
436    // Note: Promote f32 to f64 for comparison (matching fgbio's Float->Double promotion)
437    let error_tag = per_read::tag("cE");
438    if let Some(Value::Float(error_rate)) = record.data().get(&error_tag) {
439        if f64::from(*error_rate) > thresholds.max_read_error_rate {
440            return Ok(FilterResult::ExcessiveErrorRate);
441        }
442    }
443
444    Ok(FilterResult::Pass)
445}
446
447/// Filters a duplex consensus read based on per-read tags, checking both AB and BA strands
448///
449/// # Arguments
450/// * `record` - The consensus read to filter
451/// * `cc_thresholds` - Thresholds for final consensus
452/// * `ab_thresholds` - Thresholds for AB strand
453/// * `ba_thresholds` - Thresholds for BA strand
454///
455/// # Returns
456/// `FilterResult` indicating whether the read passed or why it failed
457///
458/// # Errors
459///
460/// Returns an error if the record data cannot be read.
461pub fn filter_duplex_read(
462    record: &RecordBuf,
463    cc_thresholds: &FilterThresholds,
464    ab_thresholds: &FilterThresholds,
465    ba_thresholds: &FilterThresholds,
466) -> Result<FilterResult> {
467    let data = record.data();
468
469    // First check final consensus thresholds
470    let result = filter_read(record, cc_thresholds)?;
471    if result != FilterResult::Pass {
472        return Ok(result);
473    }
474
475    // Extract AB and BA depths and error rates
476    let ab_depth = extract_int_tag(data, "aD").or_else(|| extract_int_tag(data, "aM"));
477    let ba_depth = extract_int_tag(data, "bD").or_else(|| extract_int_tag(data, "bM"));
478    let ab_error = extract_float_tag(data, "aE");
479    let ba_error = extract_float_tag(data, "bE");
480
481    // Sort depths and errors to identify which is AB (higher) and BA (lower)
482    // Following Scala: val Seq(baMaxDepth, abMaxDepth) = Seq(...).sorted
483    let (min_depth, max_depth) = match (ab_depth, ba_depth) {
484        (Some(a), Some(b)) => {
485            if a < b {
486                (a, b)
487            } else {
488                (b, a)
489            }
490        }
491        (Some(a), None) => (0, a),
492        (None, Some(b)) => (0, b),
493        (None, None) => return Ok(FilterResult::Pass), // No AB/BA tags, not duplex
494    };
495
496    let (min_error, max_error) = match (ab_error, ba_error) {
497        (Some(a), Some(b)) => {
498            if a < b {
499                (a, b)
500            } else {
501                (b, a)
502            }
503        }
504        (Some(a), None) => (a, a),
505        (None, Some(b)) => (b, b),
506        (None, None) => (0.0, 0.0),
507    };
508
509    // Check AB strand (max depth, min error) against AB thresholds
510    // Note: Promote f32 to f64 for comparison (matching fgbio's Float->Double promotion)
511    #[expect(clippy::cast_sign_loss, reason = "depth values are non-negative in BAM tags")]
512    if (max_depth as usize) < ab_thresholds.min_reads {
513        return Ok(FilterResult::InsufficientReads);
514    }
515    if f64::from(min_error) > ab_thresholds.max_read_error_rate {
516        return Ok(FilterResult::ExcessiveErrorRate);
517    }
518
519    // Check BA strand (min depth, max error) against BA thresholds
520    #[expect(clippy::cast_sign_loss, reason = "depth values are non-negative in BAM tags")]
521    if (min_depth as usize) < ba_thresholds.min_reads {
522        return Ok(FilterResult::InsufficientReads);
523    }
524    if f64::from(max_error) > ba_thresholds.max_read_error_rate {
525        return Ok(FilterResult::ExcessiveErrorRate);
526    }
527
528    Ok(FilterResult::Pass)
529}
530
531/// Helper to extract per-base array tag
532fn extract_per_base_array(
533    data: &noodles::sam::alignment::record_buf::Data,
534    tag_str: &str,
535    len: usize,
536) -> Vec<u16> {
537    let tag = per_base::tag(tag_str);
538    if let Some(Value::Array(arr)) = data.get(&tag) {
539        use noodles::sam::alignment::record_buf::data::field::value::Array;
540        match arr {
541            #[expect(clippy::cast_sign_loss, reason = "clamped to non-negative via .max(0)")]
542            Array::Int16(values) => values.iter().map(|&v| v.max(0) as u16).collect(),
543            Array::UInt16(values) => values.clone(),
544            Array::UInt8(values) => values.iter().map(|&v| u16::from(v)).collect(),
545            _ => vec![0; len],
546        }
547    } else {
548        vec![0; len]
549    }
550}
551
552/// Helper to extract consensus base string tag (ac/bc) for single-strand agreement checking
553fn extract_consensus_bases(
554    data: &noodles::sam::alignment::record_buf::Data,
555    tag_str: &str,
556    len: usize,
557) -> Vec<u8> {
558    let tag = per_base::tag(tag_str);
559    if let Some(value) = data.get(&tag) {
560        match value {
561            Value::String(s) => {
562                let bytes: &[u8] = s.as_ref();
563                bytes.to_vec()
564            }
565            Value::Array(arr) => {
566                use noodles::sam::alignment::record_buf::data::field::value::Array;
567                match arr {
568                    Array::UInt8(bytes) => bytes.clone(),
569                    _ => vec![NO_CALL_BASE; len],
570                }
571            }
572            _ => vec![NO_CALL_BASE; len],
573        }
574    } else {
575        vec![NO_CALL_BASE; len]
576    }
577}
578
579/// Masks bases in a consensus read based on per-base tags and thresholds
580///
581/// # Arguments
582/// * `record` - The consensus read to mask (will be modified in place)
583/// * `thresholds` - The filter thresholds to apply
584/// * `min_base_quality` - Minimum base quality to keep (None means no quality masking)
585///
586/// # Returns
587/// Number of bases masked
588///
589/// # Errors
590///
591/// Returns an error if the record data cannot be read or modified.
592pub fn mask_bases(
593    record: &mut RecordBuf,
594    thresholds: &FilterThresholds,
595    min_base_quality: Option<u8>,
596) -> Result<usize> {
597    // Collect sequence and quality scores directly into mutable vectors (no clone needed)
598    let mut new_seq: Vec<u8> = record.sequence().iter().collect();
599    let mut new_quals: Vec<u8> = record.quality_scores().iter().collect();
600    let mut masked_count = 0;
601    let len = new_seq.len();
602
603    // Get per-base depth and error counts using helper
604    let data = record.data();
605    let depths = extract_per_base_array(data, "cd", len);
606    let errors = extract_per_base_array(data, "ce", len);
607
608    // Mask bases that don't meet thresholds
609    for i in 0..len {
610        let should_mask =
611            // Quality too low (only check if min_base_quality is specified)
612            min_base_quality.is_some_and(|min_qual| new_quals[i] < min_qual) ||
613            // Depth too low
614            (depths[i] as usize) < thresholds.min_reads ||
615            // Error rate too high
616            (depths[i] > 0 && (f64::from(errors[i]) / f64::from(depths[i])) > thresholds.max_base_error_rate);
617
618        if should_mask {
619            // Only count as newly masked if not already N
620            if new_seq[i] != NO_CALL_BASE {
621                masked_count += 1;
622            }
623            new_seq[i] = NO_CALL_BASE;
624            new_quals[i] = MIN_PHRED;
625        }
626    }
627
628    // Update the record with masked bases
629    *record.sequence_mut() = Sequence::from(new_seq);
630    *record.quality_scores_mut() = QualityScores::from(new_quals);
631
632    Ok(masked_count)
633}
634
635/// Masks bases in a duplex consensus read based on per-base AB/BA tags and thresholds
636///
637/// This function implements the duplex per-base filtering logic from FilterConsensusReads.scala
638/// (DuplexConsensusPerBaseValues.maskBaseAt, lines 422-435).
639///
640/// For each base position, checks:
641/// - Total depth (AB + BA) against `cc_thresholds`
642/// - Total error rate against `cc_thresholds`
643/// - AB depth (max of two strands) against `ab_thresholds`
644/// - AB error rate (min of two strands) against `ab_thresholds`
645/// - BA depth (min of two strands) against `ba_thresholds`
646/// - BA error rate (max of two strands) against `ba_thresholds`
647/// - Optionally, single-strand agreement
648///
649/// # Arguments
650/// * `record` - The consensus read to mask (will be modified in place)
651/// * `cc_thresholds` - Filter thresholds for final consensus
652/// * `ab_thresholds` - Filter thresholds for AB strand
653/// * `ba_thresholds` - Filter thresholds for BA strand
654/// * `min_base_quality` - Minimum base quality to keep (None means no quality masking)
655/// * `require_ss_agreement` - If true, mask bases where AB and BA disagree
656///
657/// # Returns
658/// Number of bases masked
659///
660/// # Errors
661///
662/// Returns an error if the record data cannot be read or modified.
663pub fn mask_duplex_bases(
664    record: &mut RecordBuf,
665    cc_thresholds: &FilterThresholds,
666    ab_thresholds: &FilterThresholds,
667    ba_thresholds: &FilterThresholds,
668    min_base_quality: Option<u8>,
669    require_ss_agreement: bool,
670) -> Result<usize> {
671    // Collect sequence and quality scores directly into mutable vectors (no clone needed)
672    let mut new_seq: Vec<u8> = record.sequence().iter().collect();
673    let mut new_quals: Vec<u8> = record.quality_scores().iter().collect();
674    let len = new_seq.len();
675    let mut masked_count = 0;
676
677    let data = record.data();
678
679    // Extract AB per-base tags (ad, ae)
680    let ab_depths = extract_per_base_array(data, "ad", len);
681    let ab_errors = extract_per_base_array(data, "ae", len);
682
683    // Extract BA per-base tags (bd, be)
684    let ba_depths = extract_per_base_array(data, "bd", len);
685    let ba_errors = extract_per_base_array(data, "be", len);
686
687    // Extract consensus bases for single-strand agreement checking (ac, bc)
688    let ab_bases = if require_ss_agreement {
689        extract_consensus_bases(data, "ac", len)
690    } else {
691        vec![NO_CALL_BASE; len]
692    };
693    let ba_bases = if require_ss_agreement {
694        extract_consensus_bases(data, "bc", len)
695    } else {
696        vec![NO_CALL_BASE; len]
697    };
698
699    // Mask bases that don't meet thresholds
700    for i in 0..len {
701        if new_seq[i] == NO_CALL_BASE {
702            continue;
703        }
704
705        // Following Scala's maskBaseAt logic (lines 422-435)
706        let ab_depth_i = ab_depths[i];
707        let ba_depth_i = ba_depths[i];
708        let ab_error_i = ab_errors[i];
709        let ba_error_i = ba_errors[i];
710
711        // Sort to get max/min depths and errors
712        let max_depth = std::cmp::max(ab_depth_i, ba_depth_i);
713        let min_depth = std::cmp::min(ab_depth_i, ba_depth_i);
714
715        // For error rates, we need to calculate them first, then sort
716        let ab_error_rate =
717            if ab_depth_i > 0 { f64::from(ab_error_i) / f64::from(ab_depth_i) } else { 0.0 };
718        let ba_error_rate =
719            if ba_depth_i > 0 { f64::from(ba_error_i) / f64::from(ba_depth_i) } else { 0.0 };
720
721        let min_error_rate = ab_error_rate.min(ba_error_rate);
722        let max_error_rate = ab_error_rate.max(ba_error_rate);
723
724        // Total depth and error for final consensus check
725        let total_depth = ab_depth_i + ba_depth_i;
726        let total_error_rate = if total_depth > 0 {
727            f64::from(ab_error_i + ba_error_i) / f64::from(total_depth)
728        } else {
729            0.0
730        };
731
732        // Check if base should be masked
733        let should_mask = min_base_quality.is_some_and(|min_qual| new_quals[i] < min_qual)
734            || (total_depth as usize) < cc_thresholds.min_reads
735            || total_error_rate > cc_thresholds.max_base_error_rate
736            || (max_depth as usize) < ab_thresholds.min_reads
737            || min_error_rate > ab_thresholds.max_base_error_rate
738            || (min_depth as usize) < ba_thresholds.min_reads
739            || max_error_rate > ba_thresholds.max_base_error_rate;
740
741        // Check single-strand agreement if requested
742        let ss_disagree =
743            require_ss_agreement && ab_depth_i > 0 && ba_depth_i > 0 && ab_bases[i] != ba_bases[i];
744
745        if should_mask || ss_disagree {
746            // Only count as newly masked if not already N
747            if new_seq[i] != NO_CALL_BASE {
748                masked_count += 1;
749            }
750            new_seq[i] = NO_CALL_BASE;
751            new_quals[i] = MIN_PHRED;
752        }
753    }
754
755    // Update the record with masked bases
756    *record.sequence_mut() = Sequence::from(new_seq);
757    *record.quality_scores_mut() = QualityScores::from(new_quals);
758
759    Ok(masked_count)
760}
761
762/// Counts the number of N bases in a read
763#[must_use]
764pub fn count_no_calls(record: &RecordBuf) -> usize {
765    record.sequence().iter().filter(|b| *b == NO_CALL_BASE).count()
766}
767
768/// Calculates the mean base quality of non-N bases
769#[must_use]
770pub fn mean_base_quality(record: &RecordBuf) -> f64 {
771    // Use iterators directly to avoid allocating vectors
772    let (sum, count) = record
773        .sequence()
774        .iter()
775        .zip(record.quality_scores().iter())
776        .filter(|&(base, _)| base != NO_CALL_BASE)
777        .fold((0u64, 0usize), |(sum, count), (_, qual)| (sum + u64::from(qual), count + 1));
778
779    #[expect(
780        clippy::cast_precision_loss,
781        reason = "precision loss is acceptable for quality averaging"
782    )]
783    if count == 0 { 0.0 } else { sum as f64 / count as f64 }
784}
785
786/// Computes both no-call count and mean base quality in a single pass.
787///
788/// This is more efficient than calling `count_no_calls()` and `mean_base_quality()`
789/// separately when both values are needed, as it only iterates over the sequence once.
790///
791/// # Returns
792/// A tuple of (`no_call_count`, `mean_base_quality`)
793#[must_use]
794pub fn compute_read_stats(record: &RecordBuf) -> (usize, f64) {
795    let (qual_sum, non_n_count, n_count) = record
796        .sequence()
797        .iter()
798        .zip(record.quality_scores().iter())
799        .fold((0u64, 0usize, 0usize), |(sum, non_n, n), (base, qual)| {
800            if base == NO_CALL_BASE {
801                (sum, non_n, n + 1)
802            } else {
803                (sum + u64::from(qual), non_n + 1, n)
804            }
805        });
806
807    #[expect(
808        clippy::cast_precision_loss,
809        reason = "precision loss is acceptable for quality averaging"
810    )]
811    let mean_qual = if non_n_count == 0 { 0.0 } else { qual_sum as f64 / non_n_count as f64 };
812    (n_count, mean_qual)
813}
814
815/// Checks if all reads in a template pass filtering
816///
817/// For template-aware filtering, all primary reads (R1/R2) must pass for the template to pass.
818/// Secondary and supplementary alignments are allowed to fail independently.
819///
820/// # Arguments
821/// * `template_records` - All records with the same read name
822/// * `pass_map` - Map of record index to pass/fail status
823///
824/// # Returns
825/// true if all primary reads pass, false otherwise
826#[must_use]
827pub fn template_passes(template_records: &[RecordBuf], pass_map: &AHashMap<usize, bool>) -> bool {
828    // Find primary reads (non-secondary, non-supplementary)
829    let mut has_primary = false;
830    let mut all_primary_pass = true;
831
832    for (idx, record) in template_records.iter().enumerate() {
833        let flags = record.flags();
834        let is_primary = !flags.is_secondary() && !flags.is_supplementary();
835
836        if is_primary {
837            has_primary = true;
838            if let Some(&passes) = pass_map.get(&idx) {
839                if !passes {
840                    all_primary_pass = false;
841                    break;
842                }
843            } else {
844                // If not in map, assume it failed
845                all_primary_pass = false;
846                break;
847            }
848        }
849    }
850
851    // If no primary reads found, template fails
852    // If all primary reads pass, template passes
853    has_primary && all_primary_pass
854}
855
856/// Checks whether all primary raw records in a template pass their filters.
857///
858/// A template passes if it has at least one primary read and all primary reads pass.
859#[must_use]
860pub fn template_passes_raw(raw_records: &[Vec<u8>], pass_map: &AHashMap<usize, bool>) -> bool {
861    let mut has_primary = false;
862    let mut all_primary_pass = true;
863
864    for (idx, record) in raw_records.iter().enumerate() {
865        let flags = bam_fields::flags(record);
866        let is_primary = (flags & bam_fields::flags::SECONDARY) == 0
867            && (flags & bam_fields::flags::SUPPLEMENTARY) == 0;
868
869        if is_primary {
870            has_primary = true;
871            if let Some(&passes) = pass_map.get(&idx) {
872                if !passes {
873                    all_primary_pass = false;
874                    break;
875                }
876            } else {
877                all_primary_pass = false;
878                break;
879            }
880        }
881    }
882
883    has_primary && all_primary_pass
884}
885
886/// Detects if a record is part of a duplex consensus
887///
888/// Checks for presence of AB/BA specific tags (aD, bD)
889///
890/// # Arguments
891/// * `record` - The record to check
892///
893/// # Returns
894/// true if duplex tags are present, false otherwise
895#[must_use]
896pub fn is_duplex_consensus(record: &RecordBuf) -> bool {
897    let ad_tag = per_read::tag("aD");
898    let bd_tag = per_read::tag("bD");
899
900    record.data().get(&ad_tag).is_some() || record.data().get(&bd_tag).is_some()
901}
902
903// ============================================================================
904// Raw-byte equivalents (operate on &[u8] / &mut Vec<u8>)
905// ============================================================================
906
907use fgumi_raw_bam as bam_fields;
908
909/// Detects if a raw BAM record is a duplex consensus.
910///
911/// Checks for presence of `aD` or `bD` tags in the aux data.
912#[must_use]
913pub fn is_duplex_consensus_raw(aux_data: &[u8]) -> bool {
914    bam_fields::find_tag_type(aux_data, b"aD").is_some()
915        || bam_fields::find_tag_type(aux_data, b"bD").is_some()
916}
917
918/// Filters a raw consensus read based on per-read tags (cD depth, cE error rate).
919///
920/// # Errors
921///
922/// Returns an error if the aux data cannot be parsed.
923pub fn filter_read_raw(aux_data: &[u8], thresholds: &FilterThresholds) -> Result<FilterResult> {
924    // Check minimum reads (cD tag — UInt8)
925    if let Some(depth) = bam_fields::find_uint8_tag(aux_data, b"cD") {
926        if (depth as usize) < thresholds.min_reads {
927            return Ok(FilterResult::InsufficientReads);
928        }
929    }
930
931    // Check maximum error rate (cE tag — Float)
932    if let Some(error_rate) = bam_fields::find_float_tag(aux_data, b"cE") {
933        if f64::from(error_rate) > thresholds.max_read_error_rate {
934            return Ok(FilterResult::ExcessiveErrorRate);
935        }
936    }
937
938    Ok(FilterResult::Pass)
939}
940
941/// Filters a raw duplex consensus read, checking CC / AB / BA thresholds.
942///
943/// # Errors
944///
945/// Returns an error if the aux data cannot be parsed.
946pub fn filter_duplex_read_raw(
947    aux_data: &[u8],
948    cc_thresholds: &FilterThresholds,
949    ab_thresholds: &FilterThresholds,
950    ba_thresholds: &FilterThresholds,
951) -> Result<FilterResult> {
952    // First check final consensus thresholds
953    let result = filter_read_raw(aux_data, cc_thresholds)?;
954    if result != FilterResult::Pass {
955        return Ok(result);
956    }
957
958    // Extract AB and BA depths and error rates
959    let ab_depth = bam_fields::find_int_tag(aux_data, b"aD")
960        .or_else(|| bam_fields::find_int_tag(aux_data, b"aM"));
961    let ba_depth = bam_fields::find_int_tag(aux_data, b"bD")
962        .or_else(|| bam_fields::find_int_tag(aux_data, b"bM"));
963    let ab_error = bam_fields::find_float_tag(aux_data, b"aE");
964    let ba_error = bam_fields::find_float_tag(aux_data, b"bE");
965
966    // Sort depths to identify AB (higher) and BA (lower)
967    let (min_depth, max_depth) = match (ab_depth, ba_depth) {
968        (Some(a), Some(b)) => {
969            if a < b {
970                (a, b)
971            } else {
972                (b, a)
973            }
974        }
975        (Some(a), None) => (0, a),
976        (None, Some(b)) => (0, b),
977        (None, None) => return Ok(FilterResult::Pass),
978    };
979
980    let (min_error, max_error) = match (ab_error, ba_error) {
981        (Some(a), Some(b)) => {
982            if a < b {
983                (a, b)
984            } else {
985                (b, a)
986            }
987        }
988        (Some(a), None) => (a, a),
989        (None, Some(b)) => (b, b),
990        (None, None) => (0.0, 0.0),
991    };
992
993    // Check AB strand (max depth, min error) against AB thresholds
994    #[expect(
995        clippy::cast_sign_loss,
996        clippy::cast_possible_truncation,
997        reason = "depth values are non-negative and fit in usize on all supported platforms"
998    )]
999    if (max_depth as usize) < ab_thresholds.min_reads {
1000        return Ok(FilterResult::InsufficientReads);
1001    }
1002    if f64::from(min_error) > ab_thresholds.max_read_error_rate {
1003        return Ok(FilterResult::ExcessiveErrorRate);
1004    }
1005
1006    // Check BA strand (min depth, max error) against BA thresholds
1007    #[expect(
1008        clippy::cast_sign_loss,
1009        clippy::cast_possible_truncation,
1010        reason = "depth values are non-negative and fit in usize on all supported platforms"
1011    )]
1012    if (min_depth as usize) < ba_thresholds.min_reads {
1013        return Ok(FilterResult::InsufficientReads);
1014    }
1015    if f64::from(max_error) > ba_thresholds.max_read_error_rate {
1016        return Ok(FilterResult::ExcessiveErrorRate);
1017    }
1018
1019    Ok(FilterResult::Pass)
1020}
1021
1022/// Computes both no-call count and mean base quality in a single pass over raw BAM bytes.
1023///
1024/// Returns (`no_call_count`, `mean_base_quality`).
1025///
1026/// # Panics
1027/// Panics if the record is shorter than `MIN_BAM_RECORD_LEN` (36 bytes).
1028#[must_use]
1029pub fn compute_read_stats_raw(bam: &[u8]) -> (usize, f64) {
1030    assert!(bam.len() >= bam_fields::MIN_BAM_RECORD_LEN, "BAM record too short");
1031    let seq_off = bam_fields::seq_offset(bam);
1032    let qual_off = bam_fields::qual_offset(bam);
1033    let len = bam_fields::l_seq(bam) as usize;
1034
1035    let mut n_count = 0usize;
1036    let mut qual_sum = 0u64;
1037    let mut non_n_count = 0usize;
1038
1039    for i in 0..len {
1040        if bam_fields::is_base_n(bam, seq_off, i) {
1041            n_count += 1;
1042        } else {
1043            qual_sum += u64::from(bam_fields::get_qual(bam, qual_off, i));
1044            non_n_count += 1;
1045        }
1046    }
1047
1048    #[expect(
1049        clippy::cast_precision_loss,
1050        reason = "precision loss is acceptable for quality averaging"
1051    )]
1052    let mean_qual = if non_n_count == 0 { 0.0 } else { qual_sum as f64 / non_n_count as f64 };
1053    (n_count, mean_qual)
1054}
1055
1056/// Reads a tag value as either a Z-type string or a B-type `UInt8` array.
1057///
1058/// Returns `Some(Vec<u8>)` if found as either type, `None` otherwise.
1059fn find_string_or_uint8_array(aux_data: &[u8], tag: [u8; 2]) -> Option<Vec<u8>> {
1060    if let Some(s) = bam_fields::find_string_tag(aux_data, &tag) {
1061        Some(s.to_vec())
1062    } else {
1063        let arr = bam_fields::find_array_tag(aux_data, &tag)?;
1064        if matches!(arr.elem_type, b'C' | b'c') {
1065            #[expect(
1066                clippy::cast_possible_truncation,
1067                reason = "UInt8 array elements are guaranteed to fit in u8"
1068            )]
1069            Some((0..arr.count).map(|i| bam_fields::array_tag_element_u16(&arr, i) as u8).collect())
1070        } else {
1071            None
1072        }
1073    }
1074}
1075
1076/// Masks bases in a raw consensus read based on per-base tags and thresholds.
1077///
1078/// Modifies sequence and quality bytes in-place (no Vec allocation for the seq/qual data).
1079/// Returns the number of newly masked bases.
1080///
1081/// # Errors
1082///
1083/// Returns an error if the record is too short or the aux data cannot be parsed.
1084#[expect(
1085    clippy::similar_names,
1086    reason = "threshold variable names mirror the consensus tag names they check"
1087)]
1088pub fn mask_bases_raw(
1089    record: &mut [u8],
1090    thresholds: &FilterThresholds,
1091    min_base_quality: Option<u8>,
1092) -> Result<usize> {
1093    anyhow::ensure!(record.len() >= bam_fields::MIN_BAM_RECORD_LEN, "BAM record too short");
1094    let seq_off = bam_fields::seq_offset(record);
1095    let qual_off = bam_fields::qual_offset(record);
1096    let len = bam_fields::l_seq(record) as usize;
1097    let aux_off = bam_fields::aux_data_offset_from_record(record).unwrap_or(record.len());
1098
1099    // Pre-read per-base arrays into owned Vecs to release the immutable borrow on record
1100    let cd_vals = bam_fields::find_array_tag(&record[aux_off..], b"cd")
1101        .map(|r| bam_fields::array_tag_to_vec_u16(&r));
1102    let ce_vals = bam_fields::find_array_tag(&record[aux_off..], b"ce")
1103        .map(|r| bam_fields::array_tag_to_vec_u16(&r));
1104
1105    let mut masked_count = 0;
1106    for i in 0..len {
1107        let depth = cd_vals.as_ref().map_or(0u16, |v| v.get(i).copied().unwrap_or(0));
1108        let errors = ce_vals.as_ref().map_or(0u16, |v| v.get(i).copied().unwrap_or(0));
1109        let qual = bam_fields::get_qual(record, qual_off, i);
1110
1111        let should_mask = min_base_quality.is_some_and(|min_qual| qual < min_qual)
1112            || (depth as usize) < thresholds.min_reads
1113            || (depth > 0
1114                && (f64::from(errors) / f64::from(depth)) > thresholds.max_base_error_rate);
1115
1116        if should_mask {
1117            // Only count as newly masked if not already N
1118            if !bam_fields::is_base_n(record, seq_off, i) {
1119                masked_count += 1;
1120            }
1121            bam_fields::mask_base(record, seq_off, i);
1122            bam_fields::set_qual(record, qual_off, i, MIN_PHRED);
1123        }
1124    }
1125
1126    Ok(masked_count)
1127}
1128
1129/// Masks bases in a raw duplex consensus read based on per-base AB/BA tags and thresholds.
1130///
1131/// Returns the number of newly masked bases.
1132///
1133/// # Errors
1134///
1135/// Returns an error if the record is too short or the aux data cannot be parsed.
1136#[expect(
1137    clippy::similar_names,
1138    reason = "threshold variable names mirror the duplex strand tag names"
1139)]
1140pub fn mask_duplex_bases_raw(
1141    record: &mut [u8],
1142    cc_thresholds: &FilterThresholds,
1143    ab_thresholds: &FilterThresholds,
1144    ba_thresholds: &FilterThresholds,
1145    min_base_quality: Option<u8>,
1146    require_ss_agreement: bool,
1147) -> Result<usize> {
1148    anyhow::ensure!(record.len() >= bam_fields::MIN_BAM_RECORD_LEN, "BAM record too short");
1149    let seq_off = bam_fields::seq_offset(record);
1150    let qual_off = bam_fields::qual_offset(record);
1151    let len = bam_fields::l_seq(record) as usize;
1152    let aux_off = bam_fields::aux_data_offset_from_record(record).unwrap_or(record.len());
1153
1154    // Pre-read per-base arrays and strings into owned data to release the immutable borrow
1155    let ad_vals = bam_fields::find_array_tag(&record[aux_off..], b"ad")
1156        .map(|r| bam_fields::array_tag_to_vec_u16(&r));
1157    let ae_vals = bam_fields::find_array_tag(&record[aux_off..], b"ae")
1158        .map(|r| bam_fields::array_tag_to_vec_u16(&r));
1159    let bd_vals = bam_fields::find_array_tag(&record[aux_off..], b"bd")
1160        .map(|r| bam_fields::array_tag_to_vec_u16(&r));
1161    let be_vals = bam_fields::find_array_tag(&record[aux_off..], b"be")
1162        .map(|r| bam_fields::array_tag_to_vec_u16(&r));
1163
1164    // For single-strand agreement checking, get ac/bc tags (copy to owned).
1165    // These may be Z-type strings or B-type UInt8 arrays.
1166    let ac_owned: Option<Vec<u8>> = if require_ss_agreement {
1167        find_string_or_uint8_array(&record[aux_off..], *b"ac")
1168    } else {
1169        None
1170    };
1171    let bc_owned: Option<Vec<u8>> = if require_ss_agreement {
1172        find_string_or_uint8_array(&record[aux_off..], *b"bc")
1173    } else {
1174        None
1175    };
1176
1177    let mut masked_count = 0;
1178    for i in 0..len {
1179        // Skip if already N
1180        if bam_fields::is_base_n(record, seq_off, i) {
1181            continue;
1182        }
1183
1184        let ab_depth = ad_vals.as_ref().map_or(0u16, |v| v.get(i).copied().unwrap_or(0));
1185        let ba_depth = bd_vals.as_ref().map_or(0u16, |v| v.get(i).copied().unwrap_or(0));
1186        let ab_errors = ae_vals.as_ref().map_or(0u16, |v| v.get(i).copied().unwrap_or(0));
1187        let ba_errors = be_vals.as_ref().map_or(0u16, |v| v.get(i).copied().unwrap_or(0));
1188
1189        let max_depth = std::cmp::max(ab_depth, ba_depth);
1190        let min_depth = std::cmp::min(ab_depth, ba_depth);
1191
1192        let ab_error_rate =
1193            if ab_depth > 0 { f64::from(ab_errors) / f64::from(ab_depth) } else { 0.0 };
1194        let ba_error_rate =
1195            if ba_depth > 0 { f64::from(ba_errors) / f64::from(ba_depth) } else { 0.0 };
1196
1197        let min_error_rate = ab_error_rate.min(ba_error_rate);
1198        let max_error_rate = ab_error_rate.max(ba_error_rate);
1199
1200        let total_depth = u32::from(ab_depth) + u32::from(ba_depth);
1201        let total_error_rate = if total_depth > 0 {
1202            f64::from(u32::from(ab_errors) + u32::from(ba_errors)) / f64::from(total_depth)
1203        } else {
1204            0.0
1205        };
1206
1207        let qual = bam_fields::get_qual(record, qual_off, i);
1208
1209        let should_mask = min_base_quality.is_some_and(|min_qual| qual < min_qual)
1210            || (total_depth as usize) < cc_thresholds.min_reads
1211            || total_error_rate > cc_thresholds.max_base_error_rate
1212            || (max_depth as usize) < ab_thresholds.min_reads
1213            || min_error_rate > ab_thresholds.max_base_error_rate
1214            || (min_depth as usize) < ba_thresholds.min_reads
1215            || max_error_rate > ba_thresholds.max_base_error_rate;
1216
1217        // Check single-strand agreement if requested.
1218        // Use NO_CALL_BASE as default for missing/short tags, matching RecordBuf path behavior.
1219        let ss_disagree = require_ss_agreement && ab_depth > 0 && ba_depth > 0 && {
1220            let ac_base =
1221                ac_owned.as_ref().and_then(|ac| ac.get(i).copied()).unwrap_or(NO_CALL_BASE);
1222            let bc_base =
1223                bc_owned.as_ref().and_then(|bc| bc.get(i).copied()).unwrap_or(NO_CALL_BASE);
1224            ac_base != bc_base
1225        };
1226
1227        if should_mask || ss_disagree {
1228            masked_count += 1;
1229            bam_fields::mask_base(record, seq_off, i);
1230            bam_fields::set_qual(record, qual_off, i, MIN_PHRED);
1231        }
1232    }
1233
1234    Ok(masked_count)
1235}
1236
1237#[cfg(test)]
1238mod tests {
1239    use super::*;
1240    use fgumi_sam::builder::RecordBuilder;
1241    use noodles::sam::alignment::record::data::field::Tag;
1242
1243    fn create_test_record() -> RecordBuf {
1244        // Uses RecordBuilder which auto-generates Q30 qualities
1245        RecordBuilder::new().sequence("ACGTACGT").build()
1246    }
1247
1248    #[test]
1249    fn test_filter_result_to_rejection_reason() {
1250        assert_eq!(FilterResult::Pass.to_rejection_reason(), None);
1251        assert_eq!(
1252            FilterResult::InsufficientReads.to_rejection_reason(),
1253            Some(RejectionReason::InsufficientSupport)
1254        );
1255        assert_eq!(
1256            FilterResult::ExcessiveErrorRate.to_rejection_reason(),
1257            Some(RejectionReason::ExcessiveErrorRate)
1258        );
1259        assert_eq!(
1260            FilterResult::LowQuality.to_rejection_reason(),
1261            Some(RejectionReason::LowMeanQuality)
1262        );
1263        assert_eq!(
1264            FilterResult::TooManyNoCalls.to_rejection_reason(),
1265            Some(RejectionReason::ExcessiveNBases)
1266        );
1267    }
1268
1269    #[test]
1270    fn test_filter_config_single_value() {
1271        let config = FilterConfig::new(&[3], &[0.1], &[0.2], Some(13), None, 0.1);
1272
1273        // All threshold types are now always created (matching fgbio behavior)
1274        assert!(config.single_strand_thresholds.is_some());
1275        assert!(config.duplex_thresholds.is_some());
1276
1277        let thresh = config.single_strand_thresholds.unwrap();
1278        assert_eq!(thresh.min_reads, 3);
1279        assert!((thresh.max_read_error_rate - 0.1).abs() < f64::EPSILON);
1280        assert!((thresh.max_base_error_rate - 0.2).abs() < f64::EPSILON);
1281    }
1282
1283    #[test]
1284    fn test_filter_config_three_values() {
1285        let config =
1286            FilterConfig::new(&[5, 3, 3], &[0.05, 0.1, 0.1], &[0.1, 0.2, 0.2], Some(13), None, 0.1);
1287
1288        // All threshold types are now always created (matching fgbio behavior)
1289        assert!(config.duplex_thresholds.is_some());
1290        assert!(config.ab_thresholds.is_some());
1291        assert!(config.ba_thresholds.is_some());
1292        assert!(config.single_strand_thresholds.is_some());
1293
1294        let duplex = config.duplex_thresholds.unwrap();
1295        assert_eq!(duplex.min_reads, 5);
1296        assert!((duplex.max_read_error_rate - 0.05).abs() < f64::EPSILON);
1297
1298        let ab = config.ab_thresholds.unwrap();
1299        assert_eq!(ab.min_reads, 3);
1300        assert!((ab.max_read_error_rate - 0.1).abs() < f64::EPSILON);
1301    }
1302
1303    #[test]
1304    fn test_filter_config_valid_threshold_ordering() {
1305        // Valid: CC=10 >= AB=5 >= BA=3 for min_reads
1306        // Valid: AB=0.05 <= BA=0.1 for error rates (AB more stringent)
1307        let config = FilterConfig::new(
1308            &[10, 5, 3],
1309            &[0.02, 0.05, 0.1],
1310            &[0.05, 0.1, 0.2],
1311            Some(13),
1312            None,
1313            0.1,
1314        );
1315
1316        let cc = config.duplex_thresholds.unwrap();
1317        let ab = config.ab_thresholds.unwrap();
1318        let ba = config.ba_thresholds.unwrap();
1319
1320        assert_eq!(cc.min_reads, 10);
1321        assert_eq!(ab.min_reads, 5);
1322        assert_eq!(ba.min_reads, 3);
1323    }
1324
1325    // ========== Tests for named constructors ==========
1326
1327    #[test]
1328    fn test_filter_config_for_single_strand() {
1329        let thresholds =
1330            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1331
1332        let config = FilterConfig::for_single_strand(thresholds.clone(), Some(13), None, 0.1);
1333
1334        // All threshold types should use the same values
1335        let ss = config.single_strand_thresholds.unwrap();
1336        assert_eq!(ss.min_reads, 3);
1337        assert!((ss.max_read_error_rate - 0.1).abs() < f64::EPSILON);
1338        assert!((ss.max_base_error_rate - 0.2).abs() < f64::EPSILON);
1339
1340        // Duplex, AB, BA should also be set (for compatibility)
1341        assert!(config.duplex_thresholds.is_some());
1342        assert!(config.ab_thresholds.is_some());
1343        assert!(config.ba_thresholds.is_some());
1344    }
1345
1346    #[test]
1347    fn test_filter_config_for_duplex_symmetric() {
1348        let duplex_thresholds =
1349            FilterThresholds { min_reads: 10, max_read_error_rate: 0.05, max_base_error_rate: 0.1 };
1350        let strand_thresholds =
1351            FilterThresholds { min_reads: 5, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1352
1353        let config =
1354            FilterConfig::for_duplex(duplex_thresholds, strand_thresholds, Some(13), None, 0.1);
1355
1356        let cc = config.duplex_thresholds.unwrap();
1357        let ab = config.ab_thresholds.unwrap();
1358        let ba = config.ba_thresholds.unwrap();
1359
1360        assert_eq!(cc.min_reads, 10);
1361        assert_eq!(ab.min_reads, 5);
1362        assert_eq!(ba.min_reads, 5); // Same as AB (symmetric)
1363    }
1364
1365    #[test]
1366    fn test_filter_config_for_duplex_asymmetric() {
1367        let duplex = FilterThresholds {
1368            min_reads: 10,
1369            max_read_error_rate: 0.02,
1370            max_base_error_rate: 0.05,
1371        };
1372        let ab =
1373            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.1 };
1374        let ba =
1375            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1376
1377        let config =
1378            FilterConfig::for_duplex_asymmetric(duplex, ab, ba, Some(13), Some(20.0), 0.15);
1379
1380        let cc = config.duplex_thresholds.unwrap();
1381        let ab_t = config.ab_thresholds.unwrap();
1382        let ba_t = config.ba_thresholds.unwrap();
1383
1384        assert_eq!(cc.min_reads, 10);
1385        assert_eq!(ab_t.min_reads, 5);
1386        assert_eq!(ba_t.min_reads, 3);
1387
1388        // Verify other config fields
1389        assert_eq!(config.min_base_quality, Some(13));
1390        assert_eq!(config.min_mean_base_quality, Some(20.0));
1391        assert!((config.max_no_call_fraction - 0.15).abs() < f64::EPSILON);
1392    }
1393
1394    #[test]
1395    #[should_panic(expected = "min-reads values must be specified high to low: AB")]
1396    fn test_filter_config_for_duplex_asymmetric_invalid_ab() {
1397        let duplex =
1398            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.1 };
1399        let ab =
1400            FilterThresholds { min_reads: 10, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1401        let ba =
1402            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1403
1404        // AB (10) > duplex (5) - should panic
1405        let _ = FilterConfig::for_duplex_asymmetric(duplex, ab, ba, Some(13), None, 0.1);
1406    }
1407
1408    #[test]
1409    #[should_panic(expected = "min-reads values must be specified high to low: AB")]
1410    fn test_filter_config_invalid_ab_greater_than_cc() {
1411        // Invalid: AB (6) > CC (5) for min_reads
1412        let _ =
1413            FilterConfig::new(&[5, 6, 3], &[0.05, 0.1, 0.1], &[0.1, 0.2, 0.2], Some(13), None, 0.1);
1414    }
1415
1416    #[test]
1417    #[should_panic(expected = "min-reads values must be specified high to low: BA")]
1418    fn test_filter_config_invalid_ba_greater_than_ab() {
1419        // Invalid: BA (4) > AB (3) for min_reads
1420        let _ =
1421            FilterConfig::new(&[5, 3, 4], &[0.05, 0.1, 0.1], &[0.1, 0.2, 0.2], Some(13), None, 0.1);
1422    }
1423
1424    #[test]
1425    #[should_panic(expected = "max-read-error-rate for AB")]
1426    fn test_filter_config_invalid_ab_read_error_rate_greater_than_ba() {
1427        // Invalid: AB read error rate (0.2) > BA read error rate (0.1)
1428        let _ =
1429            FilterConfig::new(&[5, 3, 3], &[0.05, 0.2, 0.1], &[0.1, 0.2, 0.2], Some(13), None, 0.1);
1430    }
1431
1432    #[test]
1433    #[should_panic(expected = "max-base-error-rate for AB")]
1434    fn test_filter_config_invalid_ab_base_error_rate_greater_than_ba() {
1435        // Invalid: AB base error rate (0.3) > BA base error rate (0.2)
1436        let _ =
1437            FilterConfig::new(&[5, 3, 3], &[0.05, 0.1, 0.1], &[0.1, 0.3, 0.2], Some(13), None, 0.1);
1438    }
1439
1440    #[test]
1441    fn test_count_no_calls() {
1442        let mut record = create_test_record();
1443        *record.sequence_mut() = Sequence::from(b"ACNNGCNN".to_vec());
1444
1445        assert_eq!(count_no_calls(&record), 4);
1446    }
1447
1448    #[test]
1449    fn test_mean_base_quality() {
1450        let mut record = create_test_record();
1451        *record.quality_scores_mut() = QualityScores::from(vec![10, 20, 30, 40, 10, 20, 30, 40]);
1452
1453        let mean = mean_base_quality(&record);
1454        assert!((mean - 25.0).abs() < f64::EPSILON); // (10+20+30+40+10+20+30+40) / 8 = 200/8 = 25
1455    }
1456
1457    #[test]
1458    fn test_mean_base_quality_with_n() {
1459        let mut record = create_test_record();
1460        *record.sequence_mut() = Sequence::from(b"ACNNACGT".to_vec());
1461        *record.quality_scores_mut() = QualityScores::from(vec![10, 20, 0, 0, 30, 40, 10, 20]);
1462
1463        let mean = mean_base_quality(&record);
1464        // Only non-N bases: 10+20+30+40+10+20 = 130 / 6 = 21.666...
1465        assert!((mean - 21.666).abs() < 0.01);
1466    }
1467
1468    #[test]
1469    fn test_compute_read_stats() {
1470        let mut record = create_test_record();
1471        *record.sequence_mut() = Sequence::from(b"ACNNACGT".to_vec());
1472        *record.quality_scores_mut() = QualityScores::from(vec![10, 20, 0, 0, 30, 40, 10, 20]);
1473
1474        let (n_count, mean_qual) = compute_read_stats(&record);
1475
1476        // Should match individual functions
1477        assert_eq!(n_count, count_no_calls(&record));
1478        assert!((mean_qual - mean_base_quality(&record)).abs() < f64::EPSILON);
1479
1480        // Verify expected values: 2 Ns, mean of non-N quals = 130/6 = 21.666...
1481        assert_eq!(n_count, 2);
1482        assert!((mean_qual - 21.666).abs() < 0.01);
1483    }
1484
1485    #[test]
1486    fn test_compute_read_stats_all_n() {
1487        let mut record = create_test_record();
1488        *record.sequence_mut() = Sequence::from(b"NNNNNNNN".to_vec());
1489        *record.quality_scores_mut() = QualityScores::from(vec![0; 8]);
1490
1491        let (n_count, mean_qual) = compute_read_stats(&record);
1492
1493        assert_eq!(n_count, 8);
1494        assert!((mean_qual - 0.0).abs() < f64::EPSILON);
1495    }
1496
1497    #[test]
1498    fn test_compute_read_stats_no_n() {
1499        let mut record = create_test_record();
1500        *record.quality_scores_mut() = QualityScores::from(vec![30, 30, 30, 30, 30, 30, 30, 30]);
1501
1502        let (n_count, mean_qual) = compute_read_stats(&record);
1503
1504        assert_eq!(n_count, 0);
1505        assert!((mean_qual - 30.0).abs() < f64::EPSILON);
1506    }
1507
1508    #[test]
1509    fn test_filter_result() {
1510        let thresholds =
1511            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1512
1513        let mut record = create_test_record();
1514
1515        // Add cD tag with sufficient depth
1516        let tag = Tag::from([b'c', b'D']);
1517        record.data_mut().insert(tag, Value::UInt8(5));
1518
1519        let result = filter_read(&record, &thresholds).unwrap();
1520        assert_eq!(result, FilterResult::Pass);
1521    }
1522
1523    #[test]
1524    fn test_filter_insufficient_reads() {
1525        let thresholds =
1526            FilterThresholds { min_reads: 10, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1527
1528        let mut record = create_test_record();
1529
1530        // Add cD tag with insufficient depth
1531        let tag = Tag::from([b'c', b'D']);
1532        record.data_mut().insert(tag, Value::UInt8(5));
1533
1534        let result = filter_read(&record, &thresholds).unwrap();
1535        assert_eq!(result, FilterResult::InsufficientReads);
1536    }
1537
1538    #[test]
1539    fn test_template_passes_all_pass() {
1540        use noodles::sam::alignment::record::Flags;
1541
1542        let mut rec1 = create_test_record();
1543        *rec1.flags_mut() = Flags::SEGMENTED | Flags::FIRST_SEGMENT;
1544
1545        let mut rec2 = create_test_record();
1546        *rec2.flags_mut() = Flags::SEGMENTED | Flags::LAST_SEGMENT;
1547
1548        let records = vec![rec1, rec2];
1549        let mut pass_map = AHashMap::new();
1550        pass_map.insert(0, true);
1551        pass_map.insert(1, true);
1552
1553        assert!(template_passes(&records, &pass_map));
1554    }
1555
1556    #[test]
1557    fn test_template_passes_one_fails() {
1558        use noodles::sam::alignment::record::Flags;
1559
1560        let mut rec1 = create_test_record();
1561        *rec1.flags_mut() = Flags::SEGMENTED | Flags::FIRST_SEGMENT;
1562
1563        let mut rec2 = create_test_record();
1564        *rec2.flags_mut() = Flags::SEGMENTED | Flags::LAST_SEGMENT;
1565
1566        let records = vec![rec1, rec2];
1567        let mut pass_map = AHashMap::new();
1568        pass_map.insert(0, true);
1569        pass_map.insert(1, false); // R2 fails
1570
1571        assert!(!template_passes(&records, &pass_map));
1572    }
1573
1574    #[test]
1575    fn test_template_passes_secondary_ignored() {
1576        use noodles::sam::alignment::record::Flags;
1577
1578        let mut rec1 = create_test_record();
1579        *rec1.flags_mut() = Flags::SEGMENTED | Flags::FIRST_SEGMENT;
1580
1581        let mut rec2 = create_test_record();
1582        *rec2.flags_mut() = Flags::SEGMENTED | Flags::FIRST_SEGMENT | Flags::SECONDARY;
1583
1584        let records = vec![rec1, rec2];
1585        let mut pass_map = AHashMap::new();
1586        pass_map.insert(0, true); // Primary passes
1587        pass_map.insert(1, false); // Secondary fails (but should be ignored)
1588
1589        assert!(template_passes(&records, &pass_map));
1590    }
1591
1592    #[test]
1593    fn test_is_duplex_consensus() {
1594        let mut record = create_test_record();
1595
1596        // Not duplex initially
1597        assert!(!is_duplex_consensus(&record));
1598
1599        // Add aD tag
1600        let ad_tag = Tag::from([b'a', b'D']);
1601        record.data_mut().insert(ad_tag, Value::UInt8(3));
1602
1603        // Now it's duplex
1604        assert!(is_duplex_consensus(&record));
1605    }
1606
1607    // ========== Tests for filter_duplex_read() ==========
1608
1609    #[test]
1610    fn test_filter_duplex_read_pass() {
1611        let mut record = create_test_record();
1612
1613        // Add final consensus tags (passing)
1614        record.data_mut().insert(Tag::from([b'c', b'D']), Value::UInt8(10));
1615        record.data_mut().insert(Tag::from([b'c', b'E']), Value::Float(0.01));
1616
1617        // Add AB/BA tags (both passing)
1618        record.data_mut().insert(Tag::from([b'a', b'D']), Value::UInt8(6));
1619        record.data_mut().insert(Tag::from([b'b', b'D']), Value::UInt8(4));
1620        record.data_mut().insert(Tag::from([b'a', b'E']), Value::Float(0.01));
1621        record.data_mut().insert(Tag::from([b'b', b'E']), Value::Float(0.02));
1622
1623        let cc_thresholds =
1624            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.1 };
1625        let ab_thresholds =
1626            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1627        let ba_thresholds =
1628            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.25 };
1629
1630        let result =
1631            filter_duplex_read(&record, &cc_thresholds, &ab_thresholds, &ba_thresholds).unwrap();
1632        assert_eq!(result, FilterResult::Pass);
1633    }
1634
1635    #[test]
1636    fn test_filter_duplex_read_insufficient_ab_reads() {
1637        let mut record = create_test_record();
1638
1639        // Add final consensus tags (passing)
1640        record.data_mut().insert(Tag::from([b'c', b'D']), Value::UInt8(10));
1641        record.data_mut().insert(Tag::from([b'c', b'E']), Value::Float(0.01));
1642
1643        // Add AB/BA tags - AB has insufficient reads (max depth = 2 < ab_min_reads = 3)
1644        record.data_mut().insert(Tag::from([b'a', b'D']), Value::UInt8(2));
1645        record.data_mut().insert(Tag::from([b'b', b'D']), Value::UInt8(2));
1646        record.data_mut().insert(Tag::from([b'a', b'E']), Value::Float(0.01));
1647        record.data_mut().insert(Tag::from([b'b', b'E']), Value::Float(0.02));
1648
1649        let cc_thresholds =
1650            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.1 };
1651        let ab_thresholds =
1652            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1653        let ba_thresholds =
1654            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.25 };
1655
1656        let result =
1657            filter_duplex_read(&record, &cc_thresholds, &ab_thresholds, &ba_thresholds).unwrap();
1658        assert_eq!(result, FilterResult::InsufficientReads);
1659    }
1660
1661    #[test]
1662    fn test_filter_duplex_read_insufficient_ba_reads() {
1663        let mut record = create_test_record();
1664
1665        // Add final consensus tags (passing)
1666        record.data_mut().insert(Tag::from([b'c', b'D']), Value::UInt8(10));
1667        record.data_mut().insert(Tag::from([b'c', b'E']), Value::Float(0.01));
1668
1669        // Add AB/BA tags - BA has insufficient reads (min depth = 1 < ba_min_reads = 2)
1670        record.data_mut().insert(Tag::from([b'a', b'D']), Value::UInt8(5));
1671        record.data_mut().insert(Tag::from([b'b', b'D']), Value::UInt8(1));
1672        record.data_mut().insert(Tag::from([b'a', b'E']), Value::Float(0.01));
1673        record.data_mut().insert(Tag::from([b'b', b'E']), Value::Float(0.02));
1674
1675        let cc_thresholds =
1676            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.1 };
1677        let ab_thresholds =
1678            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1679        let ba_thresholds =
1680            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.25 };
1681
1682        let result =
1683            filter_duplex_read(&record, &cc_thresholds, &ab_thresholds, &ba_thresholds).unwrap();
1684        assert_eq!(result, FilterResult::InsufficientReads);
1685    }
1686
1687    #[test]
1688    fn test_filter_duplex_read_consensus_fails_first() {
1689        let mut record = create_test_record();
1690
1691        // Add final consensus tags (failing - insufficient reads)
1692        record.data_mut().insert(Tag::from([b'c', b'D']), Value::UInt8(3));
1693        record.data_mut().insert(Tag::from([b'c', b'E']), Value::Float(0.01));
1694
1695        // Add AB/BA tags (both would pass if checked)
1696        record.data_mut().insert(Tag::from([b'a', b'D']), Value::UInt8(6));
1697        record.data_mut().insert(Tag::from([b'b', b'D']), Value::UInt8(4));
1698        record.data_mut().insert(Tag::from([b'a', b'E']), Value::Float(0.01));
1699        record.data_mut().insert(Tag::from([b'b', b'E']), Value::Float(0.02));
1700
1701        let cc_thresholds =
1702            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.1 };
1703        let ab_thresholds =
1704            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1705        let ba_thresholds =
1706            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.25 };
1707
1708        let result =
1709            filter_duplex_read(&record, &cc_thresholds, &ab_thresholds, &ba_thresholds).unwrap();
1710        // Should fail on consensus check, not get to AB/BA checks
1711        assert_eq!(result, FilterResult::InsufficientReads);
1712    }
1713
1714    #[test]
1715    fn test_filter_duplex_read_with_only_one_strand() {
1716        let mut record = create_test_record();
1717
1718        // Add final consensus tags (passing)
1719        record.data_mut().insert(Tag::from([b'c', b'D']), Value::UInt8(10));
1720        record.data_mut().insert(Tag::from([b'c', b'E']), Value::Float(0.01));
1721
1722        // Add only AB tags (no BA)
1723        record.data_mut().insert(Tag::from([b'a', b'D']), Value::UInt8(6));
1724        record.data_mut().insert(Tag::from([b'a', b'E']), Value::Float(0.01));
1725
1726        let cc_thresholds =
1727            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.1 };
1728        let ab_thresholds =
1729            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.2 };
1730        let ba_thresholds =
1731            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.25 };
1732
1733        let result =
1734            filter_duplex_read(&record, &cc_thresholds, &ab_thresholds, &ba_thresholds).unwrap();
1735        // Should fail because BA has 0 reads (< ba_min_reads = 2)
1736        assert_eq!(result, FilterResult::InsufficientReads);
1737    }
1738
1739    // ========== Tests for mask_duplex_bases() ==========
1740
1741    fn create_duplex_record_with_per_base_tags() -> RecordBuf {
1742        let mut record = create_test_record();
1743
1744        // Add final consensus per-base tags (all bases have good depth and low error)
1745        // Total depth = 10, total errors = 1, error rate = 1/10 = 0.1
1746        record.data_mut().insert(
1747            Tag::from([b'c', b'd']),
1748            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
1749                vec![10, 10, 10, 10, 10, 10, 10, 10],
1750            )),
1751        );
1752        record.data_mut().insert(
1753            Tag::from([b'c', b'e']),
1754            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
1755                vec![1, 1, 1, 1, 1, 1, 1, 1], // 1 error out of 10 = 0.1 error rate
1756            )),
1757        );
1758
1759        // Add AB per-base tags (higher depth, lower error rate)
1760        // AB depth = 6, AB errors = 0, error rate = 0/6 = 0.0
1761        record.data_mut().insert(
1762            Tag::from([b'a', b'd']),
1763            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
1764                vec![6, 6, 6, 6, 6, 6, 6, 6],
1765            )),
1766        );
1767        record.data_mut().insert(
1768            Tag::from([b'a', b'e']),
1769            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
1770                vec![0, 0, 0, 0, 0, 0, 0, 0], // 0 errors out of 6 = 0.0 error rate
1771            )),
1772        );
1773
1774        // Add BA per-base tags (lower depth, higher error rate)
1775        // BA depth = 4, BA errors = 1, error rate = 1/4 = 0.25
1776        record.data_mut().insert(
1777            Tag::from([b'b', b'd']),
1778            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
1779                vec![4, 4, 4, 4, 4, 4, 4, 4],
1780            )),
1781        );
1782        record.data_mut().insert(
1783            Tag::from([b'b', b'e']),
1784            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
1785                vec![1, 1, 1, 1, 1, 1, 1, 1], // 1 error out of 4 = 0.25 error rate
1786            )),
1787        );
1788
1789        record
1790    }
1791
1792    #[test]
1793    fn test_mask_duplex_bases_all_pass() {
1794        let mut record = create_duplex_record_with_per_base_tags();
1795
1796        let cc_thresholds =
1797            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
1798        let ab_thresholds =
1799            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
1800        let ba_thresholds =
1801            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
1802
1803        mask_duplex_bases(
1804            &mut record,
1805            &cc_thresholds,
1806            &ab_thresholds,
1807            &ba_thresholds,
1808            Some(13),
1809            false,
1810        )
1811        .unwrap();
1812
1813        // All bases should pass - no Ns
1814        let seq = record.sequence();
1815        assert_eq!(seq.as_ref(), b"ACGTACGT");
1816    }
1817
1818    #[test]
1819    fn test_mask_duplex_bases_low_ab_depth() {
1820        let mut record = create_duplex_record_with_per_base_tags();
1821
1822        // Modify AB depth at position 2 to be too low
1823        record.data_mut().insert(
1824            Tag::from([b'a', b'd']),
1825            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
1826                vec![6, 6, 2, 6, 6, 6, 6, 6], // Position 2 has low AB depth
1827            )),
1828        );
1829
1830        let cc_thresholds =
1831            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
1832        let ab_thresholds =
1833            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
1834        let ba_thresholds =
1835            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
1836
1837        mask_duplex_bases(
1838            &mut record,
1839            &cc_thresholds,
1840            &ab_thresholds,
1841            &ba_thresholds,
1842            Some(13),
1843            false,
1844        )
1845        .unwrap();
1846
1847        // Position 2 should be masked
1848        let seq = record.sequence();
1849        assert_eq!(seq.as_ref(), b"ACNTACGT");
1850    }
1851
1852    #[test]
1853    fn test_mask_duplex_bases_low_ba_depth() {
1854        let mut record = create_duplex_record_with_per_base_tags();
1855
1856        // Modify BA depth at positions 3 and 4 to be too low
1857        record.data_mut().insert(
1858            Tag::from([b'b', b'd']),
1859            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
1860                vec![4, 4, 4, 1, 1, 4, 4, 4], // Positions 3,4 have low BA depth
1861            )),
1862        );
1863
1864        let cc_thresholds =
1865            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
1866        let ab_thresholds =
1867            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
1868        let ba_thresholds =
1869            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
1870
1871        mask_duplex_bases(
1872            &mut record,
1873            &cc_thresholds,
1874            &ab_thresholds,
1875            &ba_thresholds,
1876            Some(13),
1877            false,
1878        )
1879        .unwrap();
1880
1881        // Positions 3,4 should be masked
1882        let seq = record.sequence();
1883        assert_eq!(seq.as_ref(), b"ACGNNCGT");
1884    }
1885
1886    #[test]
1887    fn test_mask_duplex_bases_high_ab_error() {
1888        let mut record = create_duplex_record_with_per_base_tags();
1889
1890        // Modify AB error at position 5 to be too high
1891        // AB depth = 6, set error = 2, error rate = 2/6 = 0.33 > 0.25 threshold
1892        record.data_mut().insert(
1893            Tag::from([b'a', b'e']),
1894            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
1895                vec![0, 0, 0, 0, 0, 2, 0, 0], // Position 5 has 2 errors: 2/6 = 0.33 > 0.25
1896            )),
1897        );
1898
1899        let cc_thresholds =
1900            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
1901        let ab_thresholds =
1902            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
1903        let ba_thresholds =
1904            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
1905
1906        mask_duplex_bases(
1907            &mut record,
1908            &cc_thresholds,
1909            &ab_thresholds,
1910            &ba_thresholds,
1911            Some(13),
1912            false,
1913        )
1914        .unwrap();
1915
1916        // Position 5 should be masked
1917        let seq = record.sequence();
1918        assert_eq!(seq.as_ref(), b"ACGTANGT");
1919    }
1920
1921    #[test]
1922    fn test_mask_duplex_bases_high_ba_error() {
1923        let mut record = create_duplex_record_with_per_base_tags();
1924
1925        // Modify BA error at positions 6 and 7 to be too high
1926        // BA depth = 4, set error = 2, error rate = 2/4 = 0.5 > 0.3 threshold
1927        record.data_mut().insert(
1928            Tag::from([b'b', b'e']),
1929            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
1930                vec![1, 1, 1, 1, 1, 1, 2, 2], // Positions 6,7 have 2 errors: 2/4 = 0.5 > 0.3
1931            )),
1932        );
1933
1934        let cc_thresholds =
1935            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
1936        let ab_thresholds =
1937            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
1938        let ba_thresholds =
1939            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
1940
1941        mask_duplex_bases(
1942            &mut record,
1943            &cc_thresholds,
1944            &ab_thresholds,
1945            &ba_thresholds,
1946            Some(13),
1947            false,
1948        )
1949        .unwrap();
1950
1951        // Positions 6,7 should be masked
1952        let seq = record.sequence();
1953        assert_eq!(seq.as_ref(), b"ACGTACNN");
1954    }
1955
1956    #[test]
1957    fn test_mask_duplex_bases_low_total_depth() {
1958        let mut record = create_duplex_record_with_per_base_tags();
1959
1960        // Modify AB and BA depth at position 1 to make total depth too low
1961        // AB depth = 2, BA depth = 1, total = 3 < 5 threshold
1962        record.data_mut().insert(
1963            Tag::from([b'a', b'd']),
1964            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
1965                vec![6, 2, 6, 6, 6, 6, 6, 6],
1966            )),
1967        );
1968        record.data_mut().insert(
1969            Tag::from([b'b', b'd']),
1970            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
1971                vec![4, 1, 4, 4, 4, 4, 4, 4],
1972            )),
1973        );
1974
1975        let cc_thresholds =
1976            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
1977        let ab_thresholds =
1978            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
1979        let ba_thresholds =
1980            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
1981
1982        mask_duplex_bases(
1983            &mut record,
1984            &cc_thresholds,
1985            &ab_thresholds,
1986            &ba_thresholds,
1987            Some(13),
1988            false,
1989        )
1990        .unwrap();
1991
1992        // Position 1 should be masked
1993        let seq = record.sequence();
1994        assert_eq!(seq.as_ref(), b"ANGTACGT");
1995    }
1996
1997    #[test]
1998    fn test_mask_duplex_bases_high_total_error() {
1999        let mut record = create_duplex_record_with_per_base_tags();
2000
2001        // Modify AB and BA errors at position 0 to make total error rate too high
2002        // Total depth = 10 (AB=6, BA=4), total errors = 2 (AB=1, BA=1), rate = 2/10 = 0.2 > 0.15
2003        record.data_mut().insert(
2004            Tag::from([b'a', b'e']),
2005            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2006                vec![1, 0, 0, 0, 0, 0, 0, 0],
2007            )),
2008        );
2009        record.data_mut().insert(
2010            Tag::from([b'b', b'e']),
2011            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2012                vec![1, 1, 1, 1, 1, 1, 1, 1],
2013            )),
2014        );
2015
2016        let cc_thresholds =
2017            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
2018        let ab_thresholds =
2019            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
2020        let ba_thresholds =
2021            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
2022
2023        mask_duplex_bases(
2024            &mut record,
2025            &cc_thresholds,
2026            &ab_thresholds,
2027            &ba_thresholds,
2028            Some(13),
2029            false,
2030        )
2031        .unwrap();
2032
2033        // Position 0 should be masked
2034        let seq = record.sequence();
2035        assert_eq!(seq.as_ref(), b"NCGTACGT");
2036    }
2037
2038    #[test]
2039    fn test_mask_duplex_bases_multiple_failures() {
2040        let mut record = create_duplex_record_with_per_base_tags();
2041
2042        // Create multiple failure conditions
2043        // Position 0: low AB depth (2 < 3)
2044        record.data_mut().insert(
2045            Tag::from([b'a', b'd']),
2046            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
2047                vec![2, 6, 6, 6, 6, 6, 6, 6],
2048            )),
2049        );
2050        // Position 1: low BA depth (1 < 2)
2051        record.data_mut().insert(
2052            Tag::from([b'b', b'd']),
2053            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
2054                vec![4, 1, 4, 4, 4, 4, 4, 4],
2055            )),
2056        );
2057        // Position 2: high AB error (2/6 = 0.33 > 0.25)
2058        record.data_mut().insert(
2059            Tag::from([b'a', b'e']),
2060            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2061                vec![0, 0, 2, 0, 0, 0, 0, 0],
2062            )),
2063        );
2064
2065        let cc_thresholds =
2066            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
2067        let ab_thresholds =
2068            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
2069        let ba_thresholds =
2070            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
2071
2072        mask_duplex_bases(
2073            &mut record,
2074            &cc_thresholds,
2075            &ab_thresholds,
2076            &ba_thresholds,
2077            Some(13),
2078            false,
2079        )
2080        .unwrap();
2081
2082        // Positions 0,1,2 should be masked
2083        let seq = record.sequence();
2084        assert_eq!(seq.as_ref(), b"NNNTACGT");
2085    }
2086
2087    // ========== Tests for single-strand agreement ==========
2088
2089    #[test]
2090    fn test_mask_duplex_bases_single_strand_agreement_pass() {
2091        let mut record = create_duplex_record_with_per_base_tags();
2092
2093        // Add consensus base tags - all agree
2094        record.data_mut().insert(
2095            Tag::from([b'a', b'c']),
2096            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2097                vec![b'A', b'C', b'G', b'T', b'A', b'C', b'G', b'T'],
2098            )),
2099        );
2100        record.data_mut().insert(
2101            Tag::from([b'b', b'c']),
2102            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2103                vec![b'A', b'C', b'G', b'T', b'A', b'C', b'G', b'T'],
2104            )),
2105        );
2106
2107        let cc_thresholds =
2108            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
2109        let ab_thresholds =
2110            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
2111        let ba_thresholds =
2112            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
2113
2114        mask_duplex_bases(
2115            &mut record,
2116            &cc_thresholds,
2117            &ab_thresholds,
2118            &ba_thresholds,
2119            Some(13),
2120            true,
2121        )
2122        .unwrap();
2123
2124        // All bases agree - no masking
2125        let seq = record.sequence();
2126        assert_eq!(seq.as_ref(), b"ACGTACGT");
2127    }
2128
2129    #[test]
2130    fn test_mask_duplex_bases_single_strand_agreement_fail() {
2131        let mut record = create_duplex_record_with_per_base_tags();
2132
2133        // Add consensus base tags - positions 2, 5, 7 disagree
2134        record.data_mut().insert(
2135            Tag::from([b'a', b'c']),
2136            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2137                vec![b'A', b'C', b'G', b'T', b'A', b'C', b'G', b'T'],
2138            )),
2139        );
2140        record.data_mut().insert(
2141            Tag::from([b'b', b'c']),
2142            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2143                vec![b'A', b'C', b'T', b'T', b'A', b'G', b'G', b'A'], // Disagree at 2,5,7
2144            )),
2145        );
2146
2147        let cc_thresholds =
2148            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
2149        let ab_thresholds =
2150            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
2151        let ba_thresholds =
2152            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
2153
2154        mask_duplex_bases(
2155            &mut record,
2156            &cc_thresholds,
2157            &ab_thresholds,
2158            &ba_thresholds,
2159            Some(13),
2160            true,
2161        )
2162        .unwrap();
2163
2164        // Positions 2,5,7 should be masked due to disagreement
2165        let seq = record.sequence();
2166        assert_eq!(seq.as_ref(), b"ACNTANGN");
2167    }
2168
2169    #[test]
2170    fn test_mask_duplex_bases_single_strand_agreement_disabled() {
2171        let mut record = create_duplex_record_with_per_base_tags();
2172
2173        // Add consensus base tags - positions 2, 5, 7 disagree
2174        record.data_mut().insert(
2175            Tag::from([b'a', b'c']),
2176            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2177                vec![b'A', b'C', b'G', b'T', b'A', b'C', b'G', b'T'],
2178            )),
2179        );
2180        record.data_mut().insert(
2181            Tag::from([b'b', b'c']),
2182            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2183                vec![b'A', b'C', b'T', b'T', b'A', b'G', b'G', b'A'], // Disagree at 2,5,7
2184            )),
2185        );
2186
2187        let cc_thresholds =
2188            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
2189        let ab_thresholds =
2190            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
2191        let ba_thresholds =
2192            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
2193
2194        // Call with require_ss_agreement = false
2195        mask_duplex_bases(
2196            &mut record,
2197            &cc_thresholds,
2198            &ab_thresholds,
2199            &ba_thresholds,
2200            Some(13),
2201            false,
2202        )
2203        .unwrap();
2204
2205        // No masking due to disagreement when disabled
2206        let seq = record.sequence();
2207        assert_eq!(seq.as_ref(), b"ACGTACGT");
2208    }
2209
2210    #[test]
2211    fn test_mask_duplex_bases_combined_failures_with_agreement() {
2212        let mut record = create_duplex_record_with_per_base_tags();
2213
2214        // Create depth failure at position 0 (low AB depth: 2 < 3)
2215        record.data_mut().insert(
2216            Tag::from([b'a', b'd']),
2217            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
2218                vec![2, 6, 6, 6, 6, 6, 6, 6],
2219            )),
2220        );
2221
2222        // Create disagreement at positions 3 and 5
2223        record.data_mut().insert(
2224            Tag::from([b'a', b'c']),
2225            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2226                vec![b'A', b'C', b'G', b'T', b'A', b'C', b'G', b'T'],
2227            )),
2228        );
2229        record.data_mut().insert(
2230            Tag::from([b'b', b'c']),
2231            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt8(
2232                vec![b'A', b'C', b'G', b'A', b'A', b'G', b'G', b'T'], // Disagree at 3,5
2233            )),
2234        );
2235
2236        let cc_thresholds =
2237            FilterThresholds { min_reads: 5, max_read_error_rate: 0.05, max_base_error_rate: 0.15 };
2238        let ab_thresholds =
2239            FilterThresholds { min_reads: 3, max_read_error_rate: 0.1, max_base_error_rate: 0.25 };
2240        let ba_thresholds =
2241            FilterThresholds { min_reads: 2, max_read_error_rate: 0.15, max_base_error_rate: 0.3 };
2242
2243        mask_duplex_bases(
2244            &mut record,
2245            &cc_thresholds,
2246            &ab_thresholds,
2247            &ba_thresholds,
2248            Some(13),
2249            true,
2250        )
2251        .unwrap();
2252
2253        // Positions 0 (depth), 3 (disagreement), 5 (disagreement) should be masked
2254        let seq = record.sequence();
2255        assert_eq!(seq.as_ref(), b"NCGNANGT");
2256    }
2257
2258    #[test]
2259    fn test_extract_per_base_array_int16() {
2260        // Test that Int16 arrays are handled correctly (negative values clamped to 0)
2261        use noodles::sam::alignment::record_buf::data::field::value::Array;
2262
2263        let mut data = noodles::sam::alignment::record_buf::Data::default();
2264        let tag = Tag::from([b'c', b'D']);
2265        data.insert(tag, Value::Array(Array::Int16(vec![5, 10, -1, 15, 0])));
2266
2267        let result = extract_per_base_array(&data, "cD", 5);
2268        // Negative values should be clamped to 0
2269        assert_eq!(result, vec![5, 10, 0, 15, 0]);
2270    }
2271
2272    #[test]
2273    fn test_masked_base_quality_phred_2() {
2274        // Test that masked bases get quality=2 (Phred MIN_VALUE), not 0
2275        let mut record = RecordBuilder::new().sequence("ACGT").qualities(&[30, 30, 30, 30]).build();
2276
2277        // Set up low depth at position 0 and 2 (array tags need direct insertion)
2278        record.data_mut().insert(
2279            Tag::from([b'c', b'd']),
2280            Value::Array(noodles::sam::alignment::record_buf::data::field::value::Array::UInt16(
2281                vec![1, 10, 1, 10],
2282            )),
2283        );
2284
2285        let thresholds =
2286            FilterThresholds { min_reads: 5, max_read_error_rate: 1.0, max_base_error_rate: 1.0 };
2287
2288        mask_bases(&mut record, &thresholds, Some(10)).unwrap();
2289
2290        // Positions 0 and 2 should be masked to N with quality=2
2291        let seq: Vec<u8> = record.sequence().iter().collect();
2292        let quals: Vec<u8> = record.quality_scores().iter().collect();
2293        assert_eq!(seq, b"NCNT");
2294        assert_eq!(
2295            quals,
2296            vec![2, 30, 2, 30],
2297            "Masked bases should have quality=2 (Phred MIN_VALUE)"
2298        );
2299    }
2300
2301    #[test]
2302    fn test_error_rate_f64_comparison() {
2303        // Test that error rates are properly compared using f64
2304        // This ensures the f32 -> f64 promotion works correctly
2305        let thresholds = FilterThresholds {
2306            min_reads: 1,
2307            max_read_error_rate: 0.1,  // f64
2308            max_base_error_rate: 0.15, // f64
2309        };
2310
2311        // Error rate just under the threshold (as f32)
2312        let error_rate: f32 = 0.099;
2313        assert!(
2314            f64::from(error_rate) <= thresholds.max_read_error_rate,
2315            "f32 -> f64 comparison should work correctly"
2316        );
2317
2318        // Error rate just over the threshold (as f32)
2319        let error_rate_high: f32 = 0.101;
2320        assert!(
2321            f64::from(error_rate_high) > thresholds.max_read_error_rate,
2322            "f32 -> f64 comparison should catch values over threshold"
2323        );
2324    }
2325
2326    #[test]
2327    fn test_ac_bc_string_tag_handling() {
2328        // Test that ac/bc tags can be handled as String type (not just Array)
2329        // This is important for CODEC consensus reads where these may be stored differently
2330        let mut record = RecordBuilder::new()
2331            .sequence("ACGT")
2332            .qualities(&[30, 30, 30, 30])
2333            .tag("ac", "ACGT")
2334            .tag("bc", "ACGT")
2335            .build();
2336
2337        // These should be handled gracefully even if not array format
2338        // The function should not panic
2339        let cc_thresholds =
2340            FilterThresholds { min_reads: 1, max_read_error_rate: 1.0, max_base_error_rate: 1.0 };
2341        let ab_thresholds =
2342            FilterThresholds { min_reads: 1, max_read_error_rate: 1.0, max_base_error_rate: 1.0 };
2343        let ba_thresholds =
2344            FilterThresholds { min_reads: 1, max_read_error_rate: 1.0, max_base_error_rate: 1.0 };
2345
2346        // Should not panic even with String tags
2347        let result = mask_duplex_bases(
2348            &mut record,
2349            &cc_thresholds,
2350            &ab_thresholds,
2351            &ba_thresholds,
2352            Some(10),
2353            true,
2354        );
2355        assert!(result.is_ok());
2356    }
2357
2358    // ========== Tests for find_string_or_uint8_array ==========
2359
2360    #[test]
2361    fn test_find_string_or_uint8_array_z_tag() {
2362        // Build aux data with a Z-type string tag: ac:Z:ACGT
2363        let mut aux = Vec::new();
2364        aux.extend_from_slice(b"ac"); // tag
2365        aux.push(b'Z'); // type
2366        aux.extend_from_slice(b"ACGT\0"); // value + NUL
2367
2368        let result = super::find_string_or_uint8_array(&aux, *b"ac");
2369        assert_eq!(result, Some(b"ACGT".to_vec()));
2370    }
2371
2372    #[test]
2373    fn test_find_string_or_uint8_array_b_uint8_tag() {
2374        // Build aux data with a B-type UInt8 array tag: ac:B:C,65,67,71,84
2375        let mut aux = Vec::new();
2376        aux.extend_from_slice(b"ac"); // tag
2377        aux.push(b'B'); // type = array
2378        aux.push(b'C'); // sub-type = UInt8
2379        aux.extend_from_slice(&4u32.to_le_bytes()); // count = 4
2380        aux.extend_from_slice(&[65u8, 67, 71, 84]); // A, C, G, T
2381
2382        let result = super::find_string_or_uint8_array(&aux, *b"ac");
2383        assert_eq!(result, Some(vec![65u8, 67, 71, 84]));
2384    }
2385
2386    #[test]
2387    fn test_find_string_or_uint8_array_missing_tag() {
2388        let aux: Vec<u8> = Vec::new();
2389        let result = super::find_string_or_uint8_array(&aux, *b"ac");
2390        assert!(result.is_none());
2391    }
2392
2393    #[test]
2394    fn test_find_string_or_uint8_array_wrong_array_type() {
2395        // Build aux data with a B-type Int16 array — should return None since not UInt8
2396        let mut aux = Vec::new();
2397        aux.extend_from_slice(b"ac"); // tag
2398        aux.push(b'B'); // type = array
2399        aux.push(b's'); // sub-type = Int16
2400        aux.extend_from_slice(&2u32.to_le_bytes()); // count = 2
2401        aux.extend_from_slice(&1i16.to_le_bytes());
2402        aux.extend_from_slice(&2i16.to_le_bytes());
2403
2404        let result = super::find_string_or_uint8_array(&aux, *b"ac");
2405        assert!(result.is_none());
2406    }
2407}