Skip to main content

fgumi_sam/
record_utils.rs

1//! Record-level utilities for SAM/BAM records.
2//!
3//! This module provides utilities for working with individual SAM records, including:
4//! - Position mapping between read and reference coordinates
5//! - FR pair detection using BAM tags
6//! - Mate position calculations from MC tag
7//! - CIGAR string parsing and analysis
8
9use noodles::sam::alignment::RecordBuf;
10use noodles::sam::alignment::record::Cigar as CigarTrait;
11use noodles::sam::alignment::record::cigar::op::Kind;
12use noodles::sam::alignment::record::data::field::Tag;
13use noodles::sam::alignment::record_buf::data::field::Value;
14
15/// Pair orientation for paired-end reads.
16///
17/// This enum represents the orientation of read pairs based on
18/// the relative positioning and strand of the two reads in a pair.
19///
20/// Note: This is NOT the orientation byte as used in Picard's `MarkDuplicates`.
21#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
22pub enum PairOrientation {
23    /// Forward-Reverse orientation ("innie") - reads face each other:
24    /// ```text
25    /// 5' --F-->       <--R-- 5'
26    /// ```
27    /// The positive strand 5' position is less than the negative strand 5' position.
28    FR,
29
30    /// Reverse-Forward orientation ("outie") - reads face away from each other:
31    /// ```text
32    /// <--R-- 5'       5' --F-->
33    /// ```
34    /// The positive strand 5' position is greater than or equal to the negative strand 5' position.
35    RF,
36
37    /// Tandem orientation - both reads on the same strand:
38    /// ```text
39    /// 5' --F-->  5' --F-->   (both forward)
40    /// <--R-- 5'  <--R-- 5'   (both reverse)
41    /// ```
42    Tandem,
43}
44
45/// Returns the 1-based read position corresponding to a reference position.
46///
47/// This matches fgbio's `readPosAtRefPos` function (via htsjdk's `getReadPositionAtReferencePosition`).
48///
49/// # Arguments
50/// * `record` - The SAM record
51/// * `ref_pos` - The 1-based reference position to query
52/// * `return_last_base_if_deleted` - If true, returns the last aligned base position when
53///   the `ref_pos` falls in a deletion; if false, returns 0
54///
55/// # Returns
56/// * 1-based read position, or 0 if:
57///   - The position is not found within the alignment
58///   - The position falls in a deletion (when `return_last_base_if_deleted` is false)
59///   - The record has no alignment start
60#[must_use]
61pub fn read_pos_at_ref_pos(
62    record: &RecordBuf,
63    ref_pos: usize,
64    return_last_base_if_deleted: bool,
65) -> usize {
66    let Some(alignment_start) = record.alignment_start().map(usize::from) else {
67        return 0;
68    };
69
70    // Walk the CIGAR tracking both read and reference positions
71    let mut read_pos: usize = 0; // 0-based position in read
72    let mut ref_cursor = alignment_start; // 1-based reference position
73    let mut last_aligned_read_pos: usize = 0;
74
75    for op_result in record.cigar().iter() {
76        let Ok(op) = op_result else {
77            continue;
78        };
79        let len = op.len();
80
81        match op.kind() {
82            // Consumes both read and reference
83            Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
84                // Check if ref_pos falls within this aligned segment
85                if ref_pos >= ref_cursor && ref_pos < ref_cursor + len {
86                    // Found it! Return the 1-based read position
87                    let offset = ref_pos - ref_cursor;
88                    return read_pos + offset + 1; // +1 for 1-based
89                }
90                last_aligned_read_pos = read_pos + len; // Track last aligned position
91                read_pos += len;
92                ref_cursor += len;
93            }
94            // Consumes only read (insertion, soft clip)
95            Kind::Insertion | Kind::SoftClip => {
96                read_pos += len;
97            }
98            // Consumes only reference (deletion, skip)
99            Kind::Deletion | Kind::Skip => {
100                // Check if ref_pos falls within this deletion/skip
101                if ref_pos >= ref_cursor && ref_pos < ref_cursor + len {
102                    return if return_last_base_if_deleted && last_aligned_read_pos > 0 {
103                        last_aligned_read_pos // 1-based (already 1-based because of how we track)
104                    } else {
105                        0
106                    };
107                }
108                ref_cursor += len;
109            }
110            // Hard clip doesn't consume read bases (they're gone)
111            Kind::HardClip | Kind::Pad => {}
112        }
113    }
114
115    // Position not found within alignment
116    0
117}
118
119/// Checks if a single read is part of an FR (forward-reverse) pair using BAM tags.
120///
121/// This function determines FR pair status from a single read using information
122/// available in the BAM record (mate position, template length, strand flags).
123/// This is useful when you don't have access to the mate record.
124///
125/// Returns `true` if:
126/// - Read is paired
127/// - Both read and mate are mapped
128/// - Both are on the same reference
129/// - The pair is in FR orientation (positive strand 5' < negative strand 5')
130///
131/// This matches fgbio's `isFrPair` algorithm using htsjdk's pair orientation logic.
132#[must_use]
133pub fn is_fr_pair_from_tags(read: &RecordBuf) -> bool {
134    let flags = read.flags();
135
136    // Must be paired
137    if !flags.is_segmented() {
138        return false;
139    }
140
141    // Both read and mate must be mapped
142    if flags.is_unmapped() || flags.is_mate_unmapped() {
143        return false;
144    }
145
146    // Must be on the same reference as mate
147    if read.reference_sequence_id() != read.mate_reference_sequence_id() {
148        return false;
149    }
150
151    // Delegate to get_pair_orientation for the FR/RF/Tandem determination
152    get_pair_orientation(read) == PairOrientation::FR
153}
154
155/// Parses the MC (mate CIGAR) tag from a record, returning the parsed CIGAR ops
156/// and the mate alignment start position.
157///
158/// Returns `None` if the MC tag is missing/invalid or mate position is missing.
159fn parse_mate_cigar(read: &RecordBuf) -> Option<(Vec<(Kind, usize)>, usize)> {
160    let mc_tag = Tag::from([b'M', b'C']);
161    let mc_value = read.data().get(&mc_tag)?;
162
163    let cigar_str = match mc_value {
164        Value::String(s) => std::str::from_utf8(s.as_ref()).ok()?,
165        _ => return None,
166    };
167
168    let mate_start = usize::from(read.mate_alignment_start()?);
169    let ops = parse_cigar_string(cigar_str);
170    if ops.is_empty() {
171        return None;
172    }
173    Some((ops, mate_start))
174}
175
176/// Gets the mate's unclipped start position from the MC tag and mate position.
177///
178/// Calculates: `mateUnclippedStart = mateStart - leadingClipping`
179///
180/// Returns the mate's unclipped start as a signed integer.
181///
182/// This uses signed arithmetic to properly handle cases where the unclipped start
183/// would be before position 1 (i.e., negative in 1-based coordinates).
184/// This matches fgbio's behavior which uses signed integers for these calculations.
185///
186/// Returns `None` if the MC tag is missing or invalid, or if mate position is missing.
187#[must_use]
188pub fn mate_unclipped_start(read: &RecordBuf) -> Option<isize> {
189    let (ops, mate_start) = parse_mate_cigar(read)?;
190
191    #[expect(
192        clippy::cast_possible_wrap,
193        reason = "genomic positions fit in isize on all supported platforms (64-bit)"
194    )]
195    let mate_start_signed = mate_start as isize;
196    #[expect(
197        clippy::cast_possible_wrap,
198        reason = "CIGAR clipping lengths are small relative to isize::MAX"
199    )]
200    let leading_clip = leading_clipping(&ops) as isize;
201
202    Some(mate_start_signed - leading_clip)
203}
204
205/// Gets the mate's unclipped end position from the MC tag and mate position.
206///
207/// Calculates: `mateUnclippedEnd = mateStart + refLen - 1 + trailingClipping`
208///
209/// Returns `None` if the MC tag is missing or invalid, or if mate position is missing.
210#[must_use]
211pub fn mate_unclipped_end(read: &RecordBuf) -> Option<usize> {
212    let (ops, mate_start) = parse_mate_cigar(read)?;
213    let ref_len = cigar_reference_length(&ops);
214    let trailing_clip = trailing_clipping(&ops);
215
216    // mateStart + refLen - 1 + trailingClipping
217    Some(mate_start + ref_len - 1 + trailing_clip)
218}
219
220/// Calculates the number of bases in a read that extend past the mate's unclipped boundary.
221///
222/// This matches fgbio's `numBasesExtendingPastMate` function exactly.
223/// Uses tag-based mate information (MC tag, mate position, template length).
224///
225/// Returns 0 if not an FR pair or if required tags are missing.
226#[must_use]
227pub fn num_bases_extending_past_mate(read: &RecordBuf) -> usize {
228    // Only applies to FR pairs
229    if !is_fr_pair_from_tags(read) {
230        return 0;
231    }
232
233    let is_positive_strand = !read.flags().is_reverse_complemented();
234
235    // Get the read's CIGAR ops for soft clip calculations
236    let ops: Vec<(Kind, usize)> = read
237        .cigar()
238        .iter()
239        .filter_map(std::result::Result::ok)
240        .map(|op| (op.kind(), op.len()))
241        .collect();
242
243    // Get read length (total bases in the read, excluding hard clips)
244    let read_length: usize = ops
245        .iter()
246        .filter(|(kind, _)| {
247            matches!(
248                kind,
249                Kind::Match
250                    | Kind::SequenceMatch
251                    | Kind::SequenceMismatch
252                    | Kind::Insertion
253                    | Kind::SoftClip
254            )
255        })
256        .map(|(_, len)| len)
257        .sum();
258
259    if is_positive_strand {
260        // Positive strand: check if read extends past mate's unclipped end
261        let Some(read_alignment_end) = alignment_end(read) else {
262            return 0;
263        };
264        let mate_unclipped_end_pos = mate_unclipped_end(read).unwrap_or(usize::MAX);
265
266        if read_alignment_end >= mate_unclipped_end_pos {
267            // Aligned portion reaches or extends past mate's unclipped end
268            // Use readPosAtRefPos to find where in the read the mate ends
269            // fgbio: Math.max(0, rec.length - rec.readPosAtRefPos(pos=mateUnclippedEnd, returnLastBaseIfDeleted=false))
270            let pos_at_mate_end = read_pos_at_ref_pos(read, mate_unclipped_end_pos, false);
271            // When pos_at_mate_end is 0 (position in deletion or outside alignment),
272            // fgbio's formula gives: max(0, read_length - 0) = read_length (clip all)
273            read_length.saturating_sub(pos_at_mate_end)
274        } else {
275            // Aligned portion ends before mate's unclipped end
276            // Only clip excess soft-clipped bases that extend past the mate
277            let trailing_soft_clip = trailing_soft_clipping(&ops);
278            let gap = mate_unclipped_end_pos - read_alignment_end;
279            trailing_soft_clip.saturating_sub(gap)
280        }
281    } else {
282        // Negative strand: check if read extends before mate's unclipped start
283        let Some(read_alignment_start) = read.alignment_start().map(usize::from) else {
284            return 0;
285        };
286        #[expect(
287            clippy::cast_possible_wrap,
288            reason = "genomic positions fit in isize on all supported platforms (64-bit)"
289        )]
290        let read_alignment_start_signed = read_alignment_start as isize;
291        let mate_unclipped_start_pos = mate_unclipped_start(read).unwrap_or(0);
292
293        if read_alignment_start_signed <= mate_unclipped_start_pos {
294            // Aligned portion starts at or before mate's unclipped start
295            // Use readPosAtRefPos to find where in the read the mate starts
296            // fgbio: Math.max(0, rec.readPosAtRefPos(pos=mateUnclippedStart, returnLastBaseIfDeleted=false) - 1)
297            // Note: mate_unclipped_start_pos could be negative, but read_pos_at_ref_pos expects usize
298            // If mate_unclipped_start_pos is negative, treat it as 0 for the lookup
299            let lookup_pos = if mate_unclipped_start_pos < 0 {
300                0
301            } else {
302                #[expect(
303                    clippy::cast_sign_loss,
304                    reason = "value is verified non-negative by the if-check above"
305                )]
306                let pos = mate_unclipped_start_pos as usize;
307                pos
308            };
309            let pos_at_mate_start = read_pos_at_ref_pos(read, lookup_pos, false);
310            // When pos_at_mate_start is 0 (position in deletion or outside alignment),
311            // fgbio's formula gives: max(0, 0 - 1) = 0 (clip nothing)
312            // pos_at_mate_start is 1-based, so subtract 1 to get bases before it
313            pos_at_mate_start.saturating_sub(1)
314        } else {
315            // Aligned portion starts after mate's unclipped start
316            // Only clip excess soft-clipped bases that extend before the mate
317            // Use signed arithmetic to properly handle negative mate_unclipped_start
318            #[expect(
319                clippy::cast_possible_wrap,
320                reason = "soft clipping lengths are small relative to isize::MAX"
321            )]
322            let leading_soft_clip = leading_soft_clipping(&ops) as isize;
323            let gap = read_alignment_start_signed - mate_unclipped_start_pos;
324            // gap is always positive here (since read_alignment_start > mate_unclipped_start)
325            // clip = leading_soft_clip - gap, clamped to 0
326            if leading_soft_clip > gap {
327                #[expect(
328                    clippy::cast_sign_loss,
329                    reason = "result is guaranteed positive by the if-check above"
330                )]
331                let result = (leading_soft_clip - gap) as usize;
332                result
333            } else {
334                0
335            }
336        }
337    }
338}
339
340/// Gets the read's alignment end position (1-based, inclusive).
341///
342/// Calculated as: `alignment_start + reference_length - 1`
343#[must_use]
344pub fn alignment_end(read: &RecordBuf) -> Option<usize> {
345    let start = usize::from(read.alignment_start()?);
346    let ref_len = reference_length(&read.cigar());
347    Some(start + ref_len - 1)
348}
349
350/// Parses a CIGAR string and returns it as a vector of (Kind, length) operations.
351///
352/// This is useful for parsing the MC (mate CIGAR) tag value.
353#[must_use]
354pub fn parse_cigar_string(cigar_str: &str) -> Vec<(Kind, usize)> {
355    let mut ops = Vec::new();
356    let mut num_str = String::new();
357
358    for ch in cigar_str.chars() {
359        if ch.is_ascii_digit() {
360            num_str.push(ch);
361        } else {
362            let len: usize = num_str.parse().unwrap_or(0);
363            num_str.clear();
364
365            let kind = match ch {
366                'M' => Kind::Match,
367                'I' => Kind::Insertion,
368                'D' => Kind::Deletion,
369                'N' => Kind::Skip,
370                'S' => Kind::SoftClip,
371                'H' => Kind::HardClip,
372                'P' => Kind::Pad,
373                '=' => Kind::SequenceMatch,
374                'X' => Kind::SequenceMismatch,
375                _ => continue,
376            };
377
378            if len > 0 {
379                ops.push((kind, len));
380            }
381        }
382    }
383
384    ops
385}
386
387/// Calculates the reference length consumed by CIGAR operations.
388#[must_use]
389pub fn cigar_reference_length(ops: &[(Kind, usize)]) -> usize {
390    ops.iter()
391        .filter_map(|(kind, len)| match kind {
392            Kind::Match
393            | Kind::Deletion
394            | Kind::Skip
395            | Kind::SequenceMatch
396            | Kind::SequenceMismatch => Some(*len),
397            _ => None,
398        })
399        .sum()
400}
401
402/// Calculates leading clipping (soft + hard) from CIGAR operations.
403#[must_use]
404pub fn leading_clipping(ops: &[(Kind, usize)]) -> usize {
405    ops.iter()
406        .take_while(|(kind, _)| matches!(kind, Kind::SoftClip | Kind::HardClip))
407        .map(|(_, len)| *len)
408        .sum()
409}
410
411/// Calculates trailing clipping (soft + hard) from CIGAR operations.
412#[must_use]
413pub fn trailing_clipping(ops: &[(Kind, usize)]) -> usize {
414    ops.iter()
415        .rev()
416        .take_while(|(kind, _)| matches!(kind, Kind::SoftClip | Kind::HardClip))
417        .map(|(_, len)| *len)
418        .sum()
419}
420
421/// Calculates leading soft clipping only from CIGAR operations.
422#[must_use]
423pub fn leading_soft_clipping(ops: &[(Kind, usize)]) -> usize {
424    ops.iter()
425        .skip_while(|(kind, _)| *kind == Kind::HardClip)
426        .take_while(|(kind, _)| *kind == Kind::SoftClip)
427        .map(|(_, len)| *len)
428        .sum()
429}
430
431/// Calculates trailing soft clipping only from CIGAR operations.
432#[must_use]
433pub fn trailing_soft_clipping(ops: &[(Kind, usize)]) -> usize {
434    ops.iter()
435        .rev()
436        .skip_while(|(kind, _)| *kind == Kind::HardClip)
437        .take_while(|(kind, _)| *kind == Kind::SoftClip)
438        .map(|(_, len)| *len)
439        .sum()
440}
441
442/// Collects CIGAR operations from a record into a Vec for use with clipping functions.
443#[must_use]
444fn cigar_to_ops(record: &RecordBuf) -> Vec<(Kind, usize)> {
445    record.cigar().as_ref().iter().map(|op| (op.kind(), op.len())).collect()
446}
447
448/// Gets the unclipped start position of a read (alignment start minus leading clips).
449///
450/// This matches HTSJDK's `SAMRecord.getUnclippedStart()` behavior, which includes
451/// both soft clips and hard clips.
452///
453/// Returns `None` for unmapped reads.
454#[must_use]
455pub fn unclipped_start(record: &RecordBuf) -> Option<usize> {
456    if record.flags().is_unmapped() {
457        return None;
458    }
459    let start = usize::from(record.alignment_start()?);
460    let leading = leading_clipping(&cigar_to_ops(record));
461    Some(start.saturating_sub(leading))
462}
463
464/// Gets the unclipped end position of a read (alignment end plus trailing clips).
465///
466/// This matches HTSJDK's `SAMRecord.getUnclippedEnd()` behavior, which includes
467/// both soft clips and hard clips.
468///
469/// Returns `None` for unmapped reads.
470#[must_use]
471pub fn unclipped_end(record: &RecordBuf) -> Option<usize> {
472    if record.flags().is_unmapped() {
473        return None;
474    }
475    let start = usize::from(record.alignment_start()?);
476    let ref_len = reference_length(&record.cigar());
477    let trailing = trailing_clipping(&cigar_to_ops(record));
478    // alignment_end = start + ref_len - 1
479    // unclipped_end = alignment_end + trailing_clips
480    Some(start + ref_len.saturating_sub(1) + trailing)
481}
482
483/// Gets the unclipped 5' position of a read.
484///
485/// For forward strand reads, returns the unclipped start position.
486/// For reverse strand reads, returns the unclipped end position (the 5' end).
487///
488/// This matches fgbio's `positionOf` behavior in `GroupReadsByUmi`.
489///
490/// Returns `None` for unmapped reads.
491#[must_use]
492pub fn unclipped_five_prime_position(record: &RecordBuf) -> Option<usize> {
493    if record.flags().is_unmapped() {
494        return None;
495    }
496    if record.flags().is_reverse_complemented() {
497        unclipped_end(record)
498    } else {
499        unclipped_start(record)
500    }
501}
502
503/// Counts reference-consuming operations from a CIGAR.
504#[expect(
505    clippy::redundant_closure_for_method_calls,
506    reason = "Op::len is not a method on the trait"
507)]
508#[must_use]
509pub fn reference_length(cigar: &impl CigarTrait) -> usize {
510    cigar
511        .iter()
512        .filter_map(Result::ok)
513        .filter(|op| {
514            matches!(
515                op.kind(),
516                Kind::Match
517                    | Kind::SequenceMatch
518                    | Kind::SequenceMismatch
519                    | Kind::Deletion
520                    | Kind::Skip
521            )
522        })
523        .map(|op| op.len())
524        .sum()
525}
526
527/// Get pair orientation using htsjdk's algorithm.
528/// This matches htsjdk's `SamPairUtil.getPairOrientation()` exactly.
529#[must_use]
530pub fn get_pair_orientation(record: &RecordBuf) -> PairOrientation {
531    let is_reverse = record.flags().is_reverse_complemented();
532    let mate_reverse = record.flags().is_mate_reverse_complemented();
533
534    // Same strand = TANDEM
535    if is_reverse == mate_reverse {
536        return PairOrientation::Tandem;
537    }
538
539    // Now determine if FR or RF using htsjdk's logic
540    let alignment_start = record.alignment_start().map_or(0, usize::from);
541    let mate_start = record.mate_alignment_start().map_or(0, usize::from);
542    let insert_size = record.template_length();
543
544    // Genomic positions always fit in i64
545    #[expect(
546        clippy::cast_possible_wrap,
547        reason = "genomic positions are non-negative and fit in i64 on all supported platforms"
548    )]
549    let (positive_five_prime, negative_five_prime) = if is_reverse {
550        // This read is on reverse strand, mate is on positive strand
551        let ref_len = reference_length(&record.cigar());
552        let end = alignment_start + ref_len.saturating_sub(1);
553        (mate_start as i64, end as i64)
554    } else {
555        // This read is on positive strand, mate is on reverse strand
556        (alignment_start as i64, alignment_start as i64 + i64::from(insert_size))
557    };
558
559    // FR if positive strand 5' < negative strand 5'
560    if positive_five_prime < negative_five_prime {
561        PairOrientation::FR
562    } else {
563        PairOrientation::RF
564    }
565}
566
567/// Check if a read pair is in FR (forward-reverse) orientation.
568///
569/// FR orientation means one read is on the forward strand and one on the reverse strand,
570/// with the positive strand 5' end before the negative strand 5' end.
571///
572/// This matches the behavior of fgbio's `isFrPair` check.
573#[must_use]
574pub fn is_fr_pair(r1: &RecordBuf, r2: &RecordBuf) -> bool {
575    let r1_flags = r1.flags();
576    let r2_flags = r2.flags();
577
578    // Check if both reads are paired
579    if !r1_flags.is_segmented() || !r2_flags.is_segmented() {
580        return false;
581    }
582
583    // Check if both reads are mapped
584    if r1_flags.is_unmapped() || r2_flags.is_unmapped() {
585        return false;
586    }
587
588    // Check if both mates are mapped
589    if r1_flags.is_mate_unmapped() || r2_flags.is_mate_unmapped() {
590        return false;
591    }
592
593    // Check if both reads are on the same reference sequence
594    let r1_ref = r1.reference_sequence_id();
595    let r2_ref = r2.reference_sequence_id();
596    if r1_ref != r2_ref {
597        return false;
598    }
599
600    // Use htsjdk's pair orientation logic (matches fgbio)
601    // FR means one read is forward, one is reverse, with positive strand 5' < negative strand 5'
602    // This works regardless of which read is R1 vs R2
603    let orientation = get_pair_orientation(r1);
604    orientation == PairOrientation::FR
605}
606
607#[cfg(test)]
608mod tests {
609    use super::*;
610    use crate::builder::{RecordBuilder, RecordPairBuilder};
611    use noodles::sam::alignment::record::Flags;
612
613    // Flag constants for test readability
614    const FLAG_PAIRED: u16 = 0x1;
615    const FLAG_READ1: u16 = 0x40;
616    const FLAG_REVERSE: u16 = 0x10;
617    const FLAG_MATE_REVERSE: u16 = 0x20;
618    const FLAG_UNMAPPED: u16 = 0x4;
619    const FLAG_MATE_UNMAPPED: u16 = 0x8;
620
621    // =====================================================================
622    // Tests for parse_cigar_string
623    // =====================================================================
624
625    #[test]
626    fn test_parse_cigar_string() {
627        let ops = parse_cigar_string("10M5I20M");
628        assert_eq!(ops.len(), 3);
629        assert_eq!(ops[0], (Kind::Match, 10));
630        assert_eq!(ops[1], (Kind::Insertion, 5));
631        assert_eq!(ops[2], (Kind::Match, 20));
632    }
633
634    #[test]
635    fn test_parse_cigar_string_with_clips() {
636        let ops = parse_cigar_string("5H10S50M10S5H");
637        assert_eq!(ops.len(), 5);
638        assert_eq!(ops[0], (Kind::HardClip, 5));
639        assert_eq!(ops[1], (Kind::SoftClip, 10));
640        assert_eq!(ops[2], (Kind::Match, 50));
641        assert_eq!(ops[3], (Kind::SoftClip, 10));
642        assert_eq!(ops[4], (Kind::HardClip, 5));
643    }
644
645    #[test]
646    fn test_cigar_reference_length() {
647        let ops = parse_cigar_string("10M5I20M5D10M");
648        assert_eq!(cigar_reference_length(&ops), 45); // 10 + 20 + 5 + 10
649    }
650
651    #[test]
652    fn test_leading_clipping() {
653        let ops = parse_cigar_string("5H10S50M10S5H");
654        assert_eq!(leading_clipping(&ops), 15); // 5H + 10S
655    }
656
657    #[test]
658    fn test_trailing_clipping() {
659        let ops = parse_cigar_string("5H10S50M10S5H");
660        assert_eq!(trailing_clipping(&ops), 15); // 10S + 5H
661    }
662
663    #[test]
664    fn test_leading_soft_clipping() {
665        let ops = parse_cigar_string("5H10S50M10S5H");
666        assert_eq!(leading_soft_clipping(&ops), 10); // Only 10S, not 5H
667    }
668
669    #[test]
670    fn test_trailing_soft_clipping() {
671        let ops = parse_cigar_string("5H10S50M10S5H");
672        assert_eq!(trailing_soft_clipping(&ops), 10); // Only 10S, not 5H
673    }
674
675    // =====================================================================
676    // Tests for is_fr_pair_from_tags
677    // =====================================================================
678
679    /// Helper to create a read for FR pair tests
680    fn create_fr_test_read(
681        name: &str,
682        flags: u16,
683        pos: usize,
684        mate_pos: usize,
685        tlen: i32,
686        ref_id: usize,
687        mate_ref_id: usize,
688    ) -> RecordBuf {
689        let is_reverse = (flags & FLAG_REVERSE) != 0;
690        let is_mate_reverse = (flags & FLAG_MATE_REVERSE) != 0;
691        let is_unmapped = (flags & FLAG_UNMAPPED) != 0;
692        let is_mate_unmapped = (flags & FLAG_MATE_UNMAPPED) != 0;
693        let is_paired = (flags & FLAG_PAIRED) != 0;
694        let is_read1 = (flags & FLAG_READ1) != 0;
695
696        let mut builder = RecordBuilder::new()
697            .name(name)
698            .sequence(&"A".repeat(100))
699            .qualities(&[30u8; 100])
700            .cigar("100M")
701            .reference_sequence_id(ref_id)
702            .alignment_start(pos)
703            .mate_reference_sequence_id(mate_ref_id)
704            .mate_alignment_start(mate_pos)
705            .template_length(tlen)
706            .reverse_complement(is_reverse)
707            .mate_reverse_complement(is_mate_reverse)
708            .unmapped(is_unmapped)
709            .mate_unmapped(is_mate_unmapped);
710
711        if is_paired {
712            builder = builder.first_segment(is_read1);
713        }
714
715        builder.build()
716    }
717
718    #[test]
719    fn test_is_fr_pair_true_for_fr_orientation() {
720        // FR pair: positive strand read at pos 100, mate on negative strand
721        // Insert size positive (300), so negative 5' = 100 + 300 = 400
722        // positive 5' (100) < negative 5' (400) -> FR
723        let read = create_fr_test_read(
724            "fr_pair",
725            FLAG_PAIRED | FLAG_READ1 | FLAG_MATE_REVERSE,
726            100,
727            200,
728            300,
729            0,
730            0,
731        );
732        assert!(is_fr_pair_from_tags(&read), "Should be FR pair");
733    }
734
735    #[test]
736    fn test_is_fr_pair_false_for_rf_orientation() {
737        // RF pair: negative insert size
738        let read = create_fr_test_read(
739            "rf_pair",
740            FLAG_PAIRED | FLAG_READ1 | FLAG_MATE_REVERSE,
741            63837,
742            62870,
743            -967,
744            0,
745            0,
746        );
747        assert!(!is_fr_pair_from_tags(&read), "Should NOT be FR pair: RF orientation");
748    }
749
750    #[test]
751    fn test_is_fr_pair_false_for_tandem() {
752        // Tandem: both reads on same strand
753        let read = create_fr_test_read(
754            "tandem_pair",
755            FLAG_PAIRED | FLAG_READ1, // both positive strand
756            100,
757            200,
758            300,
759            0,
760            0,
761        );
762        assert!(!is_fr_pair_from_tags(&read), "Should NOT be FR pair: tandem");
763    }
764
765    #[test]
766    fn test_is_fr_pair_false_for_unpaired() {
767        // Create unpaired read - no flags set means not SEGMENTED
768        let record = RecordBuilder::new().name("unpaired").sequence("ACGT").build();
769        assert!(!is_fr_pair_from_tags(&record), "Should NOT be FR pair: unpaired");
770    }
771
772    #[test]
773    fn test_is_fr_pair_false_for_unmapped() {
774        let read = create_fr_test_read(
775            "unmapped",
776            FLAG_PAIRED | FLAG_READ1 | FLAG_UNMAPPED | FLAG_MATE_REVERSE,
777            100,
778            200,
779            300,
780            0,
781            0,
782        );
783        assert!(!is_fr_pair_from_tags(&read), "Should NOT be FR pair: unmapped");
784    }
785
786    #[test]
787    fn test_is_fr_pair_false_for_mate_unmapped() {
788        let read = create_fr_test_read(
789            "mate_unmapped",
790            FLAG_PAIRED | FLAG_READ1 | FLAG_MATE_UNMAPPED | FLAG_MATE_REVERSE,
791            100,
792            200,
793            300,
794            0,
795            0,
796        );
797        assert!(!is_fr_pair_from_tags(&read), "Should NOT be FR pair: mate unmapped");
798    }
799
800    #[test]
801    fn test_is_fr_pair_false_for_different_references() {
802        let read = create_fr_test_read(
803            "diff_ref",
804            FLAG_PAIRED | FLAG_READ1 | FLAG_MATE_REVERSE,
805            100,
806            200,
807            300,
808            0,
809            1, // different reference
810        );
811        assert!(!is_fr_pair_from_tags(&read), "Should NOT be FR pair: different refs");
812    }
813
814    #[test]
815    fn test_is_fr_pair_false_for_rf_when_read_on_negative_strand() {
816        // RF pair: read on negative strand, mate on positive strand
817        // For negative strand read:
818        //   positive 5' = mate_start (100)
819        //   negative 5' = alignment_end (this read, pos 200 + 100bp = 300)
820        // If mate_start (100) > alignment_end... wait, that would be FR
821        // Let's make mate_start > alignment_end: mate at 400, read ends at 300
822        // positive 5' (400) < negative 5' (300) is FALSE -> RF
823        let read = create_fr_test_read(
824            "rf_negative_strand",
825            FLAG_PAIRED | FLAG_READ1 | FLAG_REVERSE, // negative strand, mate positive
826            200,                                     // alignment_start (alignment_end will be ~300)
827            400,                                     // mate_alignment_start (positive strand 5')
828            -200,                                    // template_length
829            0,                                       // ref_id
830            0,                                       // mate_ref_id
831        );
832
833        assert!(
834            !is_fr_pair_from_tags(&read),
835            "Should NOT be FR pair: RF orientation (mate 5' > read 5')"
836        );
837    }
838
839    #[test]
840    fn test_is_fr_pair_true_for_fr_when_read_on_negative_strand() {
841        // FR pair: read on negative strand, mate on positive strand
842        // For negative strand read:
843        //   positive 5' = mate_start
844        //   negative 5' = alignment_end (this read's end)
845        // FR when positive 5' < negative 5'
846        // mate at 100, read ends at 300 -> 100 < 300 -> FR
847        let read = create_fr_test_read(
848            "fr_negative_strand",
849            FLAG_PAIRED | FLAG_READ1 | FLAG_REVERSE, // negative strand, mate positive
850            200,                                     // alignment_start (alignment_end will be ~300)
851            100,                                     // mate_alignment_start (positive strand 5')
852            200,                                     // template_length
853            0,                                       // ref_id
854            0,                                       // mate_ref_id
855        );
856
857        assert!(is_fr_pair_from_tags(&read), "Should be FR pair: mate 5' (100) < read end (300)");
858    }
859
860    #[test]
861    fn test_is_fr_pair_false_for_tandem_both_reverse() {
862        // Tandem: both reads on negative strand
863        let read = create_fr_test_read(
864            "tandem_pair_reverse",
865            FLAG_PAIRED | FLAG_READ1 | FLAG_REVERSE | FLAG_MATE_REVERSE, // both reverse
866            100,
867            200,
868            300,
869            0,
870            0,
871        );
872
873        assert!(
874            !is_fr_pair_from_tags(&read),
875            "Should NOT be FR pair: tandem orientation (both reverse)"
876        );
877    }
878
879    // =====================================================================
880    // Tests for read_pos_at_ref_pos
881    // =====================================================================
882
883    /// Helper to create a read with specific CIGAR for `read_pos_at_ref_pos` tests.
884    /// Sequence and qualities are auto-generated from CIGAR length.
885    fn create_cigar_test_read(name: &str, pos: usize, cigar: &str) -> RecordBuf {
886        RecordBuilder::new()
887            .name(name)
888            .cigar(cigar)
889            .reference_sequence_id(0)
890            .alignment_start(pos)
891            .first_segment(true)
892            .build()
893    }
894
895    #[test]
896    fn test_read_pos_at_ref_pos_simple_match() {
897        // Read at position 100 with 50M CIGAR
898        let read = create_cigar_test_read("simple", 100, "50M");
899
900        assert_eq!(read_pos_at_ref_pos(&read, 100, false), 1);
901        assert_eq!(read_pos_at_ref_pos(&read, 110, false), 11);
902        assert_eq!(read_pos_at_ref_pos(&read, 149, false), 50);
903        assert_eq!(read_pos_at_ref_pos(&read, 150, false), 0); // outside
904        assert_eq!(read_pos_at_ref_pos(&read, 99, false), 0); // before
905    }
906
907    #[test]
908    fn test_read_pos_at_ref_pos_with_insertion() {
909        // 10M5I10M at position 100
910        let read = create_cigar_test_read("insertion", 100, "10M5I10M");
911
912        assert_eq!(read_pos_at_ref_pos(&read, 105, false), 6);
913        assert_eq!(read_pos_at_ref_pos(&read, 115, false), 21); // 10 + 5 insertion + 6
914    }
915
916    #[test]
917    fn test_read_pos_at_ref_pos_with_deletion() {
918        // 10M5D10M at position 100
919        let read = create_cigar_test_read("deletion", 100, "10M5D10M");
920
921        // Position within deletion
922        assert_eq!(read_pos_at_ref_pos(&read, 112, false), 0);
923        assert_eq!(read_pos_at_ref_pos(&read, 112, true), 10); // last aligned position
924        assert_eq!(read_pos_at_ref_pos(&read, 115, false), 11); // after deletion
925    }
926
927    #[test]
928    fn test_read_pos_at_ref_pos_with_soft_clip() {
929        // 5S10M at position 100
930        let read = create_cigar_test_read("softclip", 100, "5S10M");
931
932        assert_eq!(read_pos_at_ref_pos(&read, 100, false), 6); // after 5bp soft clip
933        assert_eq!(read_pos_at_ref_pos(&read, 105, false), 11);
934    }
935
936    // =====================================================================
937    // Tests for mate_unclipped_start and mate_unclipped_end
938    // =====================================================================
939
940    /// Helper to create a read with MC tag
941    fn create_mc_test_read(name: &str, mate_pos: usize, mc_cigar: &str) -> RecordBuf {
942        RecordBuilder::new()
943            .name(name)
944            .sequence(&"A".repeat(50))
945            .qualities(&[30u8; 50])
946            .cigar("50M")
947            .reference_sequence_id(0)
948            .alignment_start(100)
949            .mate_reference_sequence_id(0)
950            .mate_alignment_start(mate_pos)
951            .first_segment(true)
952            .tag("MC", mc_cigar)
953            .build()
954    }
955
956    #[test]
957    fn test_mate_unclipped_start_no_clipping() {
958        let read = create_mc_test_read("no_clip", 200, "50M");
959        assert_eq!(mate_unclipped_start(&read), Some(200));
960    }
961
962    #[test]
963    fn test_mate_unclipped_start_with_soft_clip() {
964        // 10bp leading soft clip: unclipped start = 200 - 10 = 190
965        let read = create_mc_test_read("soft_clip", 200, "10S50M");
966        assert_eq!(mate_unclipped_start(&read), Some(190));
967    }
968
969    #[test]
970    fn test_mate_unclipped_start_with_hard_clip() {
971        // 5H + 10S = 15bp leading clip: unclipped start = 200 - 15 = 185
972        let read = create_mc_test_read("hard_clip", 200, "5H10S50M");
973        assert_eq!(mate_unclipped_start(&read), Some(185));
974    }
975
976    #[test]
977    fn test_mate_unclipped_end_no_clipping() {
978        // refLen = 50, unclipped end = 200 + 50 - 1 = 249
979        let read = create_mc_test_read("no_clip", 200, "50M");
980        assert_eq!(mate_unclipped_end(&read), Some(249));
981    }
982
983    #[test]
984    fn test_mate_unclipped_end_with_soft_clip() {
985        // refLen = 50, 10bp trailing soft clip: unclipped end = 200 + 50 - 1 + 10 = 259
986        let read = create_mc_test_read("soft_clip", 200, "50M10S");
987        assert_eq!(mate_unclipped_end(&read), Some(259));
988    }
989
990    #[test]
991    fn test_mate_unclipped_end_with_deletion() {
992        // refLen = 25 + 5 + 25 = 55: unclipped end = 200 + 55 - 1 = 254
993        let read = create_mc_test_read("deletion", 200, "25M5D25M");
994        assert_eq!(mate_unclipped_end(&read), Some(254));
995    }
996
997    #[test]
998    fn test_mate_unclipped_end_with_trailing_hard_clip() {
999        // refLen = 50, trailing = 5S + 3H = 8: unclipped end = 200 + 50 - 1 + 8 = 257
1000        let read = create_mc_test_read("hard_clip", 200, "50M5S3H");
1001        assert_eq!(mate_unclipped_end(&read), Some(257));
1002    }
1003
1004    #[test]
1005    fn test_mate_unclipped_no_mc_tag() {
1006        // Read without MC tag
1007        let record = RecordBuilder::new()
1008            .name("no_mc")
1009            .sequence("ACGT")
1010            .reference_sequence_id(0)
1011            .alignment_start(100)
1012            .mate_reference_sequence_id(0)
1013            .mate_alignment_start(200)
1014            .first_segment(true)
1015            .build();
1016
1017        assert_eq!(mate_unclipped_start(&record), None);
1018        assert_eq!(mate_unclipped_end(&record), None);
1019    }
1020
1021    #[test]
1022    fn test_mate_unclipped_invalid_utf8_mc_tag() {
1023        use noodles::sam::alignment::record_buf::data::field::value::Value;
1024        let mc_tag = Tag::from([b'M', b'C']);
1025        // Create a read then replace the MC tag with invalid UTF-8 bytes
1026        let mut record = create_mc_test_read("invalid_utf8", 200, "50M");
1027        record.data_mut().insert(mc_tag, Value::String(vec![0xFF, 0xFE].into()));
1028
1029        assert_eq!(mate_unclipped_start(&record), None);
1030        assert_eq!(mate_unclipped_end(&record), None);
1031    }
1032
1033    #[test]
1034    fn test_mate_unclipped_empty_cigar_mc_tag() {
1035        let record = create_mc_test_read("empty_cigar", 200, "");
1036        assert_eq!(mate_unclipped_start(&record), None);
1037        assert_eq!(mate_unclipped_end(&record), None);
1038    }
1039
1040    // =====================================================================
1041    // Tests for get_pair_orientation (ported from htsjdk SamPairUtilTest)
1042    // =====================================================================
1043
1044    #[test]
1045    fn test_get_pair_orientation_fr_normal() {
1046        // FR (innie): positive strand at 100, negative strand 5' further right
1047        // Read on + strand at 100, mate on - strand, insert size 200
1048        // positive 5' = 100, negative 5' = 100 + 200 = 300 -> 100 < 300 -> FR
1049        let read = create_fr_test_read(
1050            "fr_normal",
1051            FLAG_PAIRED | FLAG_READ1 | FLAG_MATE_REVERSE,
1052            100,
1053            150,
1054            200,
1055            0,
1056            0,
1057        );
1058        assert_eq!(get_pair_orientation(&read), PairOrientation::FR);
1059    }
1060
1061    #[test]
1062    fn test_get_pair_orientation_fr_overlapping() {
1063        // FR overlapping: reads face each other and overlap
1064        let read = create_fr_test_read(
1065            "fr_overlap",
1066            FLAG_PAIRED | FLAG_READ1 | FLAG_MATE_REVERSE,
1067            100,
1068            120,
1069            100,
1070            0,
1071            0,
1072        );
1073        assert_eq!(get_pair_orientation(&read), PairOrientation::FR);
1074    }
1075
1076    #[test]
1077    fn test_get_pair_orientation_rf_outie() {
1078        // RF (outie): positive strand 5' >= negative strand 5'
1079        // Read on + strand at 200, mate on - strand at 100, negative insert size
1080        // positive 5' = 200, negative 5' = 200 + (-100) = 100 -> 200 >= 100 -> RF
1081        let read = create_fr_test_read(
1082            "rf_outie",
1083            FLAG_PAIRED | FLAG_READ1 | FLAG_MATE_REVERSE,
1084            200,
1085            100,
1086            -100,
1087            0,
1088            0,
1089        );
1090        assert_eq!(get_pair_orientation(&read), PairOrientation::RF);
1091    }
1092
1093    #[test]
1094    fn test_get_pair_orientation_tandem_both_forward() {
1095        // Tandem: both reads on same strand (both forward)
1096        let read = create_fr_test_read(
1097            "tandem_fwd",
1098            FLAG_PAIRED | FLAG_READ1, // no FLAG_MATE_REVERSE, both forward
1099            100,
1100            200,
1101            200,
1102            0,
1103            0,
1104        );
1105        assert_eq!(get_pair_orientation(&read), PairOrientation::Tandem);
1106    }
1107
1108    #[test]
1109    fn test_get_pair_orientation_tandem_both_reverse() {
1110        // Tandem: both reads on same strand (both reverse)
1111        let read = create_fr_test_read(
1112            "tandem_rev",
1113            FLAG_PAIRED | FLAG_READ1 | FLAG_REVERSE | FLAG_MATE_REVERSE,
1114            100,
1115            200,
1116            200,
1117            0,
1118            0,
1119        );
1120        assert_eq!(get_pair_orientation(&read), PairOrientation::Tandem);
1121    }
1122
1123    #[test]
1124    fn test_get_pair_orientation_fr_from_reverse_strand() {
1125        // FR when read is on reverse strand: mate on + strand
1126        // For reverse strand read:
1127        //   positive 5' = mate_start (100)
1128        //   negative 5' = alignment_end (~200 + 100 - 1 = 299)
1129        // 100 < 299 -> FR
1130        let read = create_fr_test_read(
1131            "fr_reverse",
1132            FLAG_PAIRED | FLAG_READ1 | FLAG_REVERSE, // read reverse, mate forward
1133            200,
1134            100,
1135            200,
1136            0,
1137            0,
1138        );
1139        assert_eq!(get_pair_orientation(&read), PairOrientation::FR);
1140    }
1141
1142    #[test]
1143    fn test_get_pair_orientation_rf_from_reverse_strand() {
1144        // RF when read is on reverse strand
1145        // For reverse strand read:
1146        //   positive 5' = mate_start (400)
1147        //   negative 5' = alignment_end (~200 + 100 - 1 = 299)
1148        // 400 >= 299 -> RF
1149        let read = create_fr_test_read(
1150            "rf_reverse",
1151            FLAG_PAIRED | FLAG_READ1 | FLAG_REVERSE, // read reverse, mate forward
1152            200,
1153            400,
1154            -200,
1155            0,
1156            0,
1157        );
1158        assert_eq!(get_pair_orientation(&read), PairOrientation::RF);
1159    }
1160
1161    // =====================================================================
1162    // Tests for is_fr_pair (two-record version)
1163    // =====================================================================
1164
1165    /// Helper to create a pair of reads for `is_fr_pair` tests
1166    fn create_read_pair(
1167        r1_pos: usize,
1168        r1_reverse: bool,
1169        r2_pos: usize,
1170        r2_reverse: bool,
1171        same_ref: bool,
1172    ) -> (RecordBuf, RecordBuf) {
1173        let mut builder = RecordPairBuilder::new()
1174            .name("read_pair")
1175            .r1_sequence(&"A".repeat(100))
1176            .r2_sequence(&"A".repeat(100))
1177            .r1_start(r1_pos)
1178            .r2_start(r2_pos)
1179            .r1_reverse(r1_reverse)
1180            .r2_reverse(r2_reverse);
1181
1182        if !same_ref {
1183            builder = builder.r2_reference_sequence_id(1);
1184        }
1185
1186        builder.build()
1187    }
1188
1189    #[test]
1190    fn test_is_fr_pair_two_records_fr() {
1191        // FR pair: R1 forward at 100, R2 reverse at 200
1192        let (r1, r2) = create_read_pair(100, false, 200, true, true);
1193        assert!(is_fr_pair(&r1, &r2), "Should detect FR pair");
1194    }
1195
1196    #[test]
1197    fn test_is_fr_pair_two_records_rf() {
1198        // RF pair: R1 reverse at 100, R2 forward at 50 (outie)
1199        // This creates an RF orientation where positive 5' > negative 5'
1200        let (r1, r2) = create_read_pair(200, false, 100, true, true);
1201        // With these positions, positive 5' = 200, and with negative tlen,
1202        // negative 5' = 200 + (-50) = 150, so 200 > 150 -> RF
1203        let mut r1_rf = r1;
1204        *r1_rf.template_length_mut() = -50;
1205        assert!(!is_fr_pair(&r1_rf, &r2), "Should NOT detect FR pair for RF orientation");
1206    }
1207
1208    #[test]
1209    fn test_is_fr_pair_two_records_tandem() {
1210        // Tandem: both forward
1211        let (r1, r2) = create_read_pair(100, false, 200, false, true);
1212        assert!(!is_fr_pair(&r1, &r2), "Should NOT detect FR pair for tandem");
1213    }
1214
1215    #[test]
1216    fn test_is_fr_pair_two_records_different_refs() {
1217        // Different references
1218        let (r1, r2) = create_read_pair(100, false, 200, true, false);
1219        assert!(!is_fr_pair(&r1, &r2), "Should NOT detect FR pair for different refs");
1220    }
1221
1222    #[test]
1223    fn test_is_fr_pair_two_records_unpaired() {
1224        // Create unpaired reads - no paired flags set
1225        let r1 = RecordBuilder::new()
1226            .name("unpaired")
1227            .sequence("ACGT")
1228            .reference_sequence_id(0)
1229            .alignment_start(100)
1230            .build();
1231
1232        let r2 = RecordBuilder::new()
1233            .name("unpaired")
1234            .sequence("ACGT")
1235            .reference_sequence_id(0)
1236            .alignment_start(200)
1237            .build();
1238
1239        assert!(!is_fr_pair(&r1, &r2), "Should NOT detect FR pair for unpaired reads");
1240    }
1241
1242    #[test]
1243    fn test_is_fr_pair_two_records_unmapped() {
1244        // Create pair with unmapped read
1245        let (mut r1, r2) = create_read_pair(100, false, 200, true, true);
1246        *r1.flags_mut() = Flags::from(FLAG_PAIRED | FLAG_READ1 | FLAG_UNMAPPED | FLAG_MATE_REVERSE);
1247        assert!(!is_fr_pair(&r1, &r2), "Should NOT detect FR pair for unmapped read");
1248    }
1249
1250    // =====================================================================
1251    // Tests for edge cases
1252    // =====================================================================
1253
1254    #[test]
1255    fn test_parse_cigar_string_empty() {
1256        let ops = parse_cigar_string("");
1257        assert!(ops.is_empty());
1258    }
1259
1260    #[test]
1261    fn test_parse_cigar_string_all_operations() {
1262        // Test all CIGAR operation types
1263        let ops = parse_cigar_string("10M5I3D2N7S4H1P6=8X");
1264        assert_eq!(ops.len(), 9);
1265        assert_eq!(ops[0], (Kind::Match, 10));
1266        assert_eq!(ops[1], (Kind::Insertion, 5));
1267        assert_eq!(ops[2], (Kind::Deletion, 3));
1268        assert_eq!(ops[3], (Kind::Skip, 2));
1269        assert_eq!(ops[4], (Kind::SoftClip, 7));
1270        assert_eq!(ops[5], (Kind::HardClip, 4));
1271        assert_eq!(ops[6], (Kind::Pad, 1));
1272        assert_eq!(ops[7], (Kind::SequenceMatch, 6));
1273        assert_eq!(ops[8], (Kind::SequenceMismatch, 8));
1274    }
1275
1276    #[test]
1277    fn test_cigar_reference_length_with_all_ops() {
1278        // Reference-consuming ops: M, D, N, =, X
1279        // Non-consuming: I, S, H, P
1280        let ops = parse_cigar_string("10M5I3D2N7S4H1P6=8X");
1281        // Reference length = 10(M) + 3(D) + 2(N) + 6(=) + 8(X) = 29
1282        assert_eq!(cigar_reference_length(&ops), 29);
1283    }
1284
1285    #[test]
1286    fn test_leading_clipping_only_hard_clip() {
1287        let ops = parse_cigar_string("10H50M");
1288        assert_eq!(leading_clipping(&ops), 10);
1289        assert_eq!(leading_soft_clipping(&ops), 0);
1290    }
1291
1292    #[test]
1293    fn test_trailing_clipping_only_hard_clip() {
1294        let ops = parse_cigar_string("50M10H");
1295        assert_eq!(trailing_clipping(&ops), 10);
1296        assert_eq!(trailing_soft_clipping(&ops), 0);
1297    }
1298
1299    #[test]
1300    fn test_alignment_end_simple() {
1301        let record = RecordBuilder::new()
1302            .sequence(&"A".repeat(50))
1303            .alignment_start(100)
1304            .cigar("50M")
1305            .build();
1306        // end = 100 + 50 - 1 = 149
1307        assert_eq!(alignment_end(&record), Some(149));
1308    }
1309
1310    #[test]
1311    fn test_alignment_end_with_deletion() {
1312        let record = RecordBuilder::new()
1313            .sequence(&"A".repeat(50))
1314            .alignment_start(100)
1315            .cigar("25M5D25M")
1316            .build();
1317        // ref_len = 25 + 5 + 25 = 55, end = 100 + 55 - 1 = 154
1318        assert_eq!(alignment_end(&record), Some(154));
1319    }
1320
1321    #[test]
1322    fn test_alignment_end_no_alignment_start() {
1323        // Record with no alignment start set
1324        let record = RecordBuilder::new().sequence("ACGT").build();
1325        assert_eq!(alignment_end(&record), None);
1326    }
1327
1328    #[test]
1329    fn test_read_pos_at_ref_pos_complex_cigar() {
1330        // Test with skip (N) operation: 10M5N10M at position 100
1331        let read = create_cigar_test_read("skip", 100, "10M5N10M");
1332
1333        // Position 105 is within first match block -> read pos 6
1334        assert_eq!(read_pos_at_ref_pos(&read, 105, false), 6);
1335        // Position 112 is within skip -> returns 0 (not in deletion)
1336        assert_eq!(read_pos_at_ref_pos(&read, 112, false), 0);
1337        // Position 115 is in second match block -> read pos 11
1338        assert_eq!(read_pos_at_ref_pos(&read, 115, false), 11);
1339    }
1340
1341    #[test]
1342    fn test_read_pos_at_ref_pos_with_sequence_match_mismatch() {
1343        // Test with =/X operations: 5=3X5= at position 100
1344        let read = create_cigar_test_read("eq_x", 100, "5=3X5=");
1345
1346        assert_eq!(read_pos_at_ref_pos(&read, 100, false), 1);
1347        assert_eq!(read_pos_at_ref_pos(&read, 105, false), 6); // in mismatch block
1348        assert_eq!(read_pos_at_ref_pos(&read, 108, false), 9); // in second match block
1349    }
1350
1351    // =========================================================================
1352    // Tests for unclipped_start, unclipped_end, unclipped_five_prime_position
1353    // These match HTSJDK/fgbio behavior including both soft AND hard clips
1354    // =========================================================================
1355
1356    #[test]
1357    fn test_unclipped_start_no_clips() {
1358        // Simple 50M at position 100 -> unclipped_start = 100
1359        let read = create_cigar_test_read("simple", 100, "50M");
1360        assert_eq!(unclipped_start(&read), Some(100));
1361    }
1362
1363    #[test]
1364    fn test_unclipped_start_with_leading_soft_clip() {
1365        // 5S45M at position 100 -> unclipped_start = 100 - 5 = 95
1366        let read = create_cigar_test_read("soft", 100, "5S45M");
1367        assert_eq!(unclipped_start(&read), Some(95));
1368    }
1369
1370    #[test]
1371    fn test_unclipped_start_with_leading_hard_clip() {
1372        // 10H50M at position 100 -> unclipped_start = 100 - 10 = 90
1373        let read = create_cigar_test_read("hard", 100, "10H50M");
1374        assert_eq!(unclipped_start(&read), Some(90));
1375    }
1376
1377    #[test]
1378    fn test_unclipped_start_with_soft_and_hard_clips() {
1379        // Matches fgbio test: 5S45M10H at position 10 -> unclipped_start = 10 - 5 = 5
1380        // (trailing hard clip doesn't affect unclipped_start)
1381        let read = create_cigar_test_read("both", 10, "5S45M10H");
1382        assert_eq!(unclipped_start(&read), Some(5));
1383    }
1384
1385    #[test]
1386    fn test_unclipped_start_with_leading_hard_and_soft() {
1387        // 3H5S42M at position 100 -> unclipped_start = 100 - 5 - 3 = 92
1388        let read = create_cigar_test_read("hard_soft", 100, "3H5S42M");
1389        assert_eq!(unclipped_start(&read), Some(92));
1390    }
1391
1392    #[test]
1393    fn test_unclipped_end_no_clips() {
1394        // Simple 50M at position 100 -> alignment_end = 149, unclipped_end = 149
1395        let read = create_cigar_test_read("simple", 100, "50M");
1396        assert_eq!(unclipped_end(&read), Some(149));
1397    }
1398
1399    #[test]
1400    fn test_unclipped_end_with_trailing_soft_clip() {
1401        // 45M5S at position 100 -> alignment_end = 144, unclipped_end = 144 + 5 = 149
1402        let read = create_cigar_test_read("soft", 100, "45M5S");
1403        assert_eq!(unclipped_end(&read), Some(149));
1404    }
1405
1406    #[test]
1407    fn test_unclipped_end_with_trailing_hard_clip() {
1408        // 50M10H at position 100 -> alignment_end = 149, unclipped_end = 149 + 10 = 159
1409        let read = create_cigar_test_read("hard", 100, "50M10H");
1410        assert_eq!(unclipped_end(&read), Some(159));
1411    }
1412
1413    #[test]
1414    fn test_unclipped_end_with_soft_and_hard_clips() {
1415        // Matches fgbio test: 5S45M10H at position 10
1416        // alignment_end = 10 + 45 - 1 = 54
1417        // unclipped_end = 54 + 10 = 64
1418        let read = create_cigar_test_read("both", 10, "5S45M10H");
1419        assert_eq!(unclipped_end(&read), Some(64));
1420    }
1421
1422    #[test]
1423    fn test_unclipped_end_with_trailing_soft_and_hard() {
1424        // 42M5S3H at position 100 -> alignment_end = 141, unclipped_end = 141 + 5 + 3 = 149
1425        let read = create_cigar_test_read("soft_hard", 100, "42M5S3H");
1426        assert_eq!(unclipped_end(&read), Some(149));
1427    }
1428
1429    #[test]
1430    fn test_unclipped_five_prime_forward_strand() {
1431        // Forward strand: 5' is at unclipped_start
1432        // 5S45M10H at position 10 -> unclipped_start = 5
1433        let read = create_cigar_test_read("fwd", 10, "5S45M10H");
1434        assert_eq!(unclipped_five_prime_position(&read), Some(5));
1435    }
1436
1437    #[test]
1438    fn test_unclipped_five_prime_reverse_strand() {
1439        // Reverse strand: 5' is at unclipped_end
1440        // 5S45M10H at position 10 -> unclipped_end = 64
1441        let read = RecordBuilder::new()
1442            .name("rev")
1443            .sequence(&"A".repeat(60))
1444            .reference_sequence_id(0)
1445            .alignment_start(10)
1446            .cigar("5S45M10H")
1447            .flags(Flags::REVERSE_COMPLEMENTED)
1448            .build();
1449        assert_eq!(unclipped_five_prime_position(&read), Some(64));
1450    }
1451
1452    #[test]
1453    fn test_unclipped_positions_unmapped_returns_none() {
1454        let read =
1455            RecordBuilder::new().name("unmapped").sequence("ACGT").flags(Flags::UNMAPPED).build();
1456        assert_eq!(unclipped_start(&read), None);
1457        assert_eq!(unclipped_end(&read), None);
1458        assert_eq!(unclipped_five_prime_position(&read), None);
1459    }
1460
1461    #[test]
1462    fn test_unclipped_end_with_deletion() {
1463        // 25M5D25M at position 100
1464        // ref_len = 25 + 5 + 25 = 55
1465        // alignment_end = 100 + 55 - 1 = 154
1466        let read = create_cigar_test_read("del", 100, "25M5D25M");
1467        assert_eq!(unclipped_end(&read), Some(154));
1468    }
1469
1470    #[test]
1471    fn test_unclipped_end_with_insertion() {
1472        // 25M5I25M at position 100
1473        // ref_len = 25 + 25 = 50 (insertion doesn't consume reference)
1474        // alignment_end = 100 + 50 - 1 = 149
1475        let read = create_cigar_test_read("ins", 100, "25M5I25M");
1476        assert_eq!(unclipped_end(&read), Some(149));
1477    }
1478}