Skip to main content

fgumi_lib/
read_info.rs

1//! Data structures for tracking read position information.
2//!
3//! This module provides the `ReadInfo` structure for tracking genomic position information
4//! about reads, which is essential for:
5//! - Grouping reads by genomic location
6//! - Ordering reads for duplicate marking
7//! - Organizing reads for UMI-based consensus calling
8//!
9//! `ReadInfo` captures the unclipped 5' positions of paired-end reads (or single-end reads),
10//! along with library and cell barcode information for proper grouping.
11
12use anyhow::Result;
13use noodles::sam::alignment::record::data::field::Tag;
14use noodles::sam::header::Header;
15use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
16use std::cmp::Ordering;
17use std::collections::HashMap;
18use std::sync::Arc;
19
20use bstr::ByteSlice;
21
22use crate::sam::record_utils::{
23    mate_unclipped_end, mate_unclipped_start, unclipped_five_prime_position,
24};
25use crate::template::Template;
26use crate::unified_pipeline::GroupKey;
27use noodles::sam;
28use noodles::sam::alignment::record_buf::data::field::value::Value as DataValue;
29
30/// A lookup table mapping read group IDs to library names.
31///
32/// This is built from the SAM header's @RG lines and used to resolve the library
33/// name (LB field) from a record's RG tag. This matches fgbio's behavior where
34/// grouping uses the library name, not the read group ID.
35///
36/// Uses `Arc<str>` for library names to avoid cloning strings for every read.
37///
38/// # Note: `LibraryLookup` vs `LibraryIndex`
39///
40/// Both `LibraryLookup` and [`LibraryIndex`] exist for different use cases:
41/// - `LibraryLookup`: String-based lookup returning library names. Used by
42///   [`ReadInfo::from`] where the actual library name string is needed.
43/// - [`LibraryIndex`]: Hash-based lookup returning numeric indices. Used by
44///   [`compute_group_key`] in the hot path where only equality comparison matters,
45///   avoiding string allocations.
46pub type LibraryLookup = Arc<HashMap<String, Arc<str>>>;
47
48/// Shared "unknown" library string to avoid repeated allocations.
49static UNKNOWN_LIBRARY: std::sync::LazyLock<Arc<str>> =
50    std::sync::LazyLock::new(|| Arc::from("unknown"));
51
52/// Builds a library lookup table from a SAM header.
53///
54/// Iterates through all @RG lines in the header and creates a mapping from
55/// read group ID to library name. If a read group has no LB field, it maps
56/// to "unknown" (matching fgbio's behavior).
57///
58/// # Arguments
59///
60/// * `header` - The SAM header containing @RG lines
61///
62/// # Returns
63///
64/// An `Arc<HashMap>` mapping read group IDs to library names
65#[must_use]
66pub fn build_library_lookup(header: &Header) -> LibraryLookup {
67    let mut lookup = HashMap::new();
68
69    for (id, rg) in header.read_groups() {
70        // Get the LB field from the read group's other_fields
71        let library: Arc<str> = rg
72            .other_fields()
73            .get(&rg_tag::LIBRARY)
74            .map_or_else(|| Arc::clone(&UNKNOWN_LIBRARY), |s| Arc::from(s.to_string()));
75        lookup.insert(id.to_string(), library);
76    }
77
78    Arc::new(lookup)
79}
80
81// ============================================================================
82// LibraryIndex - Fast RG to library index mapping for GroupKey computation
83// ============================================================================
84
85/// Fast lookup from RG tag value to library index.
86///
87/// This provides O(1) library lookup during Decode using string hashing,
88/// replacing the O(n) string comparison in the original `LibraryLookup`.
89#[derive(Debug, Clone)]
90pub struct LibraryIndex {
91    /// Map from RG string hash to library index.
92    lookup: ahash::AHashMap<u64, u16>,
93    /// Library names for each index (for output/debugging).
94    names: Vec<Arc<str>>,
95    /// Unknown library index (always 0).
96    unknown_idx: u16,
97}
98
99impl LibraryIndex {
100    /// Build a library index from a SAM header.
101    ///
102    /// Each unique library name gets a sequential index starting from 0.
103    /// Index 0 is reserved for "unknown" library.
104    ///
105    /// # Panics
106    ///
107    /// Panics if the header contains more than 65,535 distinct libraries.
108    #[must_use]
109    pub fn from_header(header: &Header) -> Self {
110        let mut lookup = ahash::AHashMap::new();
111        let mut names = vec![Arc::clone(&UNKNOWN_LIBRARY)]; // Index 0 = unknown
112        let mut library_to_idx: ahash::AHashMap<Arc<str>, u16> = ahash::AHashMap::new();
113        library_to_idx.insert(Arc::clone(&UNKNOWN_LIBRARY), 0);
114
115        for (id, rg) in header.read_groups() {
116            // Get library name from LB field
117            let library: Arc<str> = rg
118                .other_fields()
119                .get(&rg_tag::LIBRARY)
120                .map_or_else(|| Arc::clone(&UNKNOWN_LIBRARY), |s| Arc::from(s.to_string()));
121
122            // Get or create library index
123            let lib_idx = *library_to_idx.entry(library.clone()).or_insert_with(|| {
124                let idx: u16 =
125                    names.len().try_into().expect("too many distinct libraries for u16 index");
126                names.push(library);
127                idx
128            });
129
130            // Hash the RG string and map to library index
131            let rg_hash = Self::hash_rg(id.as_bytes());
132            lookup.insert(rg_hash, lib_idx);
133        }
134
135        Self { lookup, names, unknown_idx: 0 }
136    }
137
138    /// Get the library index for a read group hash.
139    ///
140    /// Returns 0 (unknown) if the RG hash is not found.
141    #[must_use]
142    pub fn get(&self, rg_hash: u64) -> u16 {
143        *self.lookup.get(&rg_hash).unwrap_or(&self.unknown_idx)
144    }
145
146    /// Get the library name for an index.
147    #[must_use]
148    pub fn library_name(&self, idx: u16) -> &Arc<str> {
149        self.names.get(idx as usize).unwrap_or(&self.names[0])
150    }
151
152    /// Hash a byte slice using `AHash`. Returns 0 for `None`.
153    ///
154    /// This is the single hashing implementation used by all `hash_*` methods.
155    #[must_use]
156    pub fn hash_bytes(bytes: Option<&[u8]>) -> u64 {
157        use ahash::AHasher;
158        use std::hash::{Hash, Hasher};
159        match bytes {
160            Some(b) => {
161                let mut hasher = AHasher::default();
162                b.hash(&mut hasher);
163                hasher.finish()
164            }
165            None => 0,
166        }
167    }
168
169    /// Hash an RG tag value for lookup.
170    #[must_use]
171    pub fn hash_rg(rg_bytes: &[u8]) -> u64 {
172        Self::hash_bytes(Some(rg_bytes))
173    }
174
175    /// Hash a cell barcode for `GroupKey`.
176    #[must_use]
177    pub fn hash_cell_barcode(cell_bytes: Option<&[u8]>) -> u64 {
178        Self::hash_bytes(cell_bytes)
179    }
180
181    /// Hash a read name for `GroupKey`.
182    #[must_use]
183    pub fn hash_name(name_bytes: Option<&[u8]>) -> u64 {
184        Self::hash_bytes(name_bytes)
185    }
186}
187
188impl Default for LibraryIndex {
189    fn default() -> Self {
190        Self {
191            lookup: ahash::AHashMap::new(),
192            names: vec![Arc::clone(&UNKNOWN_LIBRARY)],
193            unknown_idx: 0,
194        }
195    }
196}
197
198// ============================================================================
199// compute_group_key - Pre-compute GroupKey from a single BAM record
200// ============================================================================
201
202/// Compute a `GroupKey` from a single BAM record.
203///
204/// This computes all grouping information from a single record:
205/// - Own position (`ref_id`, unclipped 5' position, strand)
206/// - Mate position (from MC tag, or UNKNOWN if unpaired/missing)
207/// - Library index (from RG tag)
208/// - Cell barcode hash
209/// - Name hash
210///
211/// For paired-end reads with MC tag, both positions are computed and normalized
212/// so the lower position comes first (matching `ReadInfo` behavior).
213///
214/// For unpaired reads or reads without MC tag, mate position uses UNKNOWN sentinels.
215#[must_use]
216#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
217pub fn compute_group_key(
218    record: &sam::alignment::RecordBuf,
219    library_index: &LibraryIndex,
220    cell_tag: Option<Tag>,
221) -> GroupKey {
222    // Extract name hash
223    let name_hash = LibraryIndex::hash_name(record.name().map(AsRef::as_ref));
224
225    // Skip secondary and supplementary alignments - they get a default key with UNKNOWN_REF.
226    // RecordPositionGrouper either skips them or coalesces them by name_hash.
227    let flags = record.flags();
228    if flags.is_secondary() || flags.is_supplementary() {
229        return GroupKey { name_hash, ..GroupKey::default() };
230    }
231
232    // Own position
233    let ref_id =
234        record.reference_sequence_id().map_or(-1, |id| i32::try_from(id).unwrap_or(i32::MAX));
235    let pos = get_unclipped_position_for_groupkey(record);
236    let strand = u8::from(flags.is_reverse_complemented());
237
238    // Extract library index from RG tag
239    let library_idx = if let Some(DataValue::String(rg_bytes)) = record.data().get(b"RG") {
240        let rg_hash = LibraryIndex::hash_rg(rg_bytes);
241        library_index.get(rg_hash)
242    } else {
243        0 // unknown
244    };
245
246    // Extract cell barcode hash
247    let cell_hash = if let Some(tag) = cell_tag {
248        if let Some(DataValue::String(cb_bytes)) = record.data().get(&tag) {
249            LibraryIndex::hash_cell_barcode(Some(cb_bytes))
250        } else {
251            0
252        }
253    } else {
254        0
255    };
256
257    // Check if paired and has mate info
258    let is_paired = flags.is_segmented();
259    if !is_paired {
260        return GroupKey::single(ref_id, pos, strand, library_idx, cell_hash, name_hash);
261    }
262
263    // Try to get mate position from MC tag
264    let mate_strand = u8::from(flags.is_mate_reverse_complemented());
265    let mate_ref_id =
266        record.mate_reference_sequence_id().map_or(-1, |id| i32::try_from(id).unwrap_or(i32::MAX));
267
268    // Get mate unclipped 5' position based on strand
269    let mate_pos = if mate_strand == 1 {
270        // Reverse strand - 5' is at the end
271        mate_unclipped_end(record).map(|p| p as i32)
272    } else {
273        // Forward strand - 5' is at the start
274        mate_unclipped_start(record).map(|p| p as i32)
275    };
276
277    match mate_pos {
278        Some(mp) => GroupKey::paired(
279            ref_id,
280            pos,
281            strand,
282            mate_ref_id,
283            mp,
284            mate_strand,
285            library_idx,
286            cell_hash,
287            name_hash,
288        ),
289        None => {
290            // No MC tag - fall back to single-end behavior
291            GroupKey::single(ref_id, pos, strand, library_idx, cell_hash, name_hash)
292        }
293    }
294}
295
296/// Get unclipped 5' position for `GroupKey` computation.
297///
298/// Returns 0 for unmapped reads. Returns `i32::MAX` for malformed mapped reads
299/// (those missing alignment start) to avoid incorrectly grouping them at position 0.
300/// Used in the hot path where we need a numeric value for grouping.
301#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
302fn get_unclipped_position_for_groupkey(record: &sam::alignment::RecordBuf) -> i32 {
303    if record.flags().is_unmapped() {
304        return 0;
305    }
306    unclipped_five_prime_position(record).map_or(i32::MAX, |pos| pos as i32)
307}
308
309/// Information about read positions needed for grouping and ordering.
310///
311/// This structure stores genomic position information for one or two reads (paired-end),
312/// along with library and optional cell barcode information. The R1/R2 positions are
313/// stored in their original order without normalization, matching fgbio's behavior.
314///
315/// # Fields
316///
317/// * `ref_index1` - Reference sequence index for R1 (first read of pair)
318/// * `start1` - Unclipped 5' start position for R1
319/// * `strand1` - Strand for R1 (0=forward, 1=reverse)
320/// * `ref_index2` - Reference sequence index for R2 (or `UNKNOWN_REF` if unpaired)
321/// * `start2` - Unclipped 5' start position for R2
322/// * `strand2` - Strand for R2
323/// * `library` - Library identifier from RG tag
324/// * `cell_barcode` - Optional cell barcode for single-cell applications
325///
326/// # Ordering
327///
328/// `ReadInfo` implements `Ord` matching fgbio's case class field order:
329/// 1. Reference index 1 (R1)
330/// 2. Start position 1 (R1)
331/// 3. Strand 1 (R1)
332/// 4. Reference index 2 (R2)
333/// 5. Start position 2 (R2)
334/// 6. Strand 2 (R2)
335/// 7. Library
336/// 8. Cell barcode (optional)
337#[derive(Debug, Clone, PartialEq, Eq, Hash)]
338pub struct ReadInfo {
339    /// Reference sequence index for first read
340    pub ref_index1: i32,
341    /// Unclipped 5' position for first read
342    pub start1: i32,
343    /// Strand for first read (0=forward, 1=reverse)
344    pub strand1: u8,
345    /// Reference sequence index for second read
346    pub ref_index2: i32,
347    /// Unclipped 5' position for second read
348    pub start2: i32,
349    /// Strand for second read (0=forward, 1=reverse)
350    pub strand2: u8,
351    /// Library identifier (from RG tag). Uses `Arc<str>` to avoid cloning for every read.
352    pub library: Arc<str>,
353    /// Optional cell barcode (for single-cell data)
354    pub cell_barcode: Option<String>,
355}
356
357impl ReadInfo {
358    // Maximum values for unmapped reads
359    const UNKNOWN_REF: i32 = i32::MAX;
360    const UNKNOWN_POS: i32 = i32::MAX;
361    const UNKNOWN_STRAND: u8 = u8::MAX;
362
363    /// Creates a new `ReadInfo`, automatically ordering by lower coordinate first.
364    ///
365    /// This constructor automatically normalizes the order so that the read with the
366    /// genomically earlier position becomes position 1. This matches fgbio's behavior
367    /// where the lower of the two mates' positions comes first.
368    ///
369    /// # Arguments
370    ///
371    /// * `ref_index1` - Reference index for first read
372    /// * `start1` - Start position for first read
373    /// * `strand1` - Strand for first read (0=forward, 1=reverse)
374    /// * `ref_index2` - Reference index for second read
375    /// * `start2` - Start position for second read
376    /// * `strand2` - Strand for second read
377    /// * `library` - Library identifier
378    /// * `cell_barcode` - Optional cell barcode
379    ///
380    /// # Returns
381    ///
382    /// A new `ReadInfo` with reads ordered by genomic position (lower position first)
383    #[must_use]
384    #[allow(clippy::too_many_arguments)]
385    pub fn new(
386        ref_index1: i32,
387        start1: i32,
388        strand1: u8,
389        ref_index2: i32,
390        start2: i32,
391        strand2: u8,
392        library: Arc<str>,
393        cell_barcode: Option<String>,
394    ) -> Self {
395        // Normalize order: put lower genomic position first (matching fgbio)
396        // Uses tuple comparison for cleaner, potentially faster code
397        let r1_earlier = (ref_index1, start1, strand1) <= (ref_index2, start2, strand2);
398
399        if r1_earlier {
400            Self { ref_index1, start1, strand1, ref_index2, start2, strand2, library, cell_barcode }
401        } else {
402            Self {
403                ref_index1: ref_index2,
404                start1: start2,
405                strand1: strand2,
406                ref_index2: ref_index1,
407                start2: start1,
408                strand2: strand1,
409                library,
410                cell_barcode,
411            }
412        }
413    }
414
415    /// Creates `ReadInfo` for a single-end or fragment read.
416    ///
417    /// This constructor is used for unpaired reads or when only one read of a pair
418    /// is available. The second read fields are set to UNKNOWN values.
419    ///
420    /// # Arguments
421    ///
422    /// * `ref_index` - Reference sequence index
423    /// * `start` - Unclipped 5' start position
424    /// * `strand` - Strand (0=forward, 1=reverse)
425    /// * `library` - Library identifier
426    /// * `cell_barcode` - Optional cell barcode
427    ///
428    /// # Returns
429    ///
430    /// A new `ReadInfo` for a single read
431    #[must_use]
432    pub fn single(
433        ref_index: i32,
434        start: i32,
435        strand: u8,
436        library: Arc<str>,
437        cell_barcode: Option<String>,
438    ) -> Self {
439        Self {
440            ref_index1: ref_index,
441            start1: start,
442            strand1: strand,
443            ref_index2: Self::UNKNOWN_REF,
444            start2: Self::UNKNOWN_POS,
445            strand2: Self::UNKNOWN_STRAND,
446            library,
447            cell_barcode,
448        }
449    }
450
451    /// Creates `ReadInfo` for an unmapped read.
452    ///
453    /// All position and reference fields are set to UNKNOWN values since the
454    /// read has no genomic alignment.
455    ///
456    /// # Arguments
457    ///
458    /// * `library` - Library identifier
459    /// * `cell_barcode` - Optional cell barcode
460    ///
461    /// # Returns
462    ///
463    /// A new `ReadInfo` for an unmapped read
464    #[must_use]
465    pub fn unmapped(library: Arc<str>, cell_barcode: Option<String>) -> Self {
466        Self {
467            ref_index1: Self::UNKNOWN_REF,
468            start1: Self::UNKNOWN_POS,
469            strand1: Self::UNKNOWN_STRAND,
470            ref_index2: Self::UNKNOWN_REF,
471            start2: Self::UNKNOWN_POS,
472            strand2: Self::UNKNOWN_STRAND,
473            library,
474            cell_barcode,
475        }
476    }
477
478    /// Converts a strand boolean to a byte representation.
479    ///
480    /// Converts the strand direction to the byte format used by `ReadInfo`:
481    /// - `true` (positive/forward strand) -> 0
482    /// - `false` (negative/reverse strand) -> 1
483    ///
484    /// # Arguments
485    ///
486    /// * `positive` - `true` for forward strand, `false` for reverse
487    ///
488    /// # Returns
489    ///
490    /// Byte representation (0 or 1)
491    #[must_use]
492    pub fn strand_to_byte(positive: bool) -> u8 {
493        u8::from(!positive)
494    }
495
496    /// Builds a `ReadInfo` from a template.
497    ///
498    /// Extracts position information from the template's reads, including unclipped
499    /// 5' positions, library information from the header's @RG line (via library lookup),
500    /// and optional cell barcode.
501    ///
502    /// # Arguments
503    ///
504    /// * `template` - The template containing one or two reads
505    /// * `cell_tag` - SAM tag to use for extracting cell barcode (e.g., "CB")
506    /// * `library_lookup` - Lookup table mapping read group IDs to library names
507    ///
508    /// # Returns
509    ///
510    /// A new `ReadInfo` populated from the template
511    ///
512    /// # Errors
513    ///
514    /// Returns an error if:
515    /// - The template has no records
516    /// - The unclipped position could not be computed (e.g., missing alignment data)
517    pub fn from(
518        template: &Template,
519        cell_tag: Tag,
520        library_lookup: &LibraryLookup,
521    ) -> Result<Self> {
522        // Get reads once and reuse (avoid calling r1()/r2() twice)
523        let r1 = template.r1();
524        let r2 = template.r2();
525        let record = r1.as_ref().or(r2.as_ref()).ok_or_else(|| {
526            anyhow::anyhow!(
527                "Template '{}' has no records (empty template)",
528                String::from_utf8_lossy(&template.name)
529            )
530        })?;
531
532        // Extract library from RG tag, then look up the library name from the header.
533        // This matches fgbio's behavior where grouping uses the library name (LB field),
534        // not the read group ID.
535        // Uses Arc<str> to avoid cloning strings - Arc::clone is just a reference count increment.
536        let library: Arc<str> = if let Some(DataValue::String(rg_bytes)) = record.data().get(b"RG")
537        {
538            // Convert bytes to str for lookup without allocating a String
539            let rg_id = std::str::from_utf8(rg_bytes).unwrap_or("unknown");
540            library_lookup.get(rg_id).cloned().unwrap_or_else(|| Arc::clone(&UNKNOWN_LIBRARY))
541        } else {
542            Arc::clone(&UNKNOWN_LIBRARY)
543        };
544
545        // Extract cell barcode
546        let cell_barcode = if let Some(DataValue::String(cb_str)) = record.data().get(&cell_tag) {
547            Some(String::from_utf8_lossy(cb_str).into_owned())
548        } else {
549            None
550        };
551
552        // For paired-end, extract both positions using UNCLIPPED positions
553        // Reuse r1/r2 bindings instead of calling template.r1()/r2() again
554        #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
555        if let (Some(r1), Some(r2)) = (&r1, &r2) {
556            let ref_index1 = r1.reference_sequence_id().map_or(-1, |id| id as i32);
557            let start1 = get_unclipped_position(r1)?;
558            let strand1 = u8::from(r1.flags().is_reverse_complemented());
559
560            let ref_index2 = r2.reference_sequence_id().map_or(-1, |id| id as i32);
561            let start2 = get_unclipped_position(r2)?;
562            let strand2 = u8::from(r2.flags().is_reverse_complemented());
563
564            Ok(ReadInfo::new(
565                ref_index1,
566                start1,
567                strand1,
568                ref_index2,
569                start2,
570                strand2,
571                library,
572                cell_barcode,
573            ))
574        } else {
575            // Single-end: use ReadInfo::single() which correctly sets UNKNOWN values for R2
576            // This matches fgbio's behavior where missing R2 uses (Int.MaxValue, Int.MaxValue, Byte.MaxValue)
577            let ref_index = record.reference_sequence_id().map_or(-1, |id| id as i32);
578            let start = get_unclipped_position(record)?;
579            let strand = u8::from(record.flags().is_reverse_complemented());
580
581            Ok(ReadInfo::single(ref_index, start, strand, library, cell_barcode))
582        }
583    }
584}
585
586impl PartialOrd for ReadInfo {
587    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
588        Some(self.cmp(other))
589    }
590}
591
592impl Ord for ReadInfo {
593    /// Compares `ReadInfo` in the same order as fgbio's case class field order:
594    /// refIndex1, start1, strand1, refIndex2, start2, strand2, library, cellBarcode
595    fn cmp(&self, other: &Self) -> Ordering {
596        self.ref_index1
597            .cmp(&other.ref_index1)
598            .then_with(|| self.start1.cmp(&other.start1))
599            .then_with(|| self.strand1.cmp(&other.strand1))
600            .then_with(|| self.ref_index2.cmp(&other.ref_index2))
601            .then_with(|| self.start2.cmp(&other.start2))
602            .then_with(|| self.strand2.cmp(&other.strand2))
603            .then_with(|| self.library.cmp(&other.library))
604            .then_with(|| self.cell_barcode.cmp(&other.cell_barcode))
605    }
606}
607
608/// Gets the unclipped 5' position of a read.
609///
610/// For reads with clipped bases (soft or hard), this function calculates the position that would
611/// correspond to the 5' end of the read if it were fully aligned (including clipped bases).
612///
613/// - **Forward strand**: Returns alignment start minus leading clips (soft + hard)
614/// - **Reverse strand**: Returns alignment end plus trailing clips (soft + hard)
615///
616/// This matches htsjdk's `getUnclippedStart` and `getUnclippedEnd` which consider BOTH
617/// soft clips and hard clips.
618///
619/// # Arguments
620///
621/// * `record` - The SAM/BAM record
622///
623/// # Returns
624///
625/// The unclipped 5' position (0 for unmapped reads)
626///
627/// # Errors
628///
629/// Returns an error if the record is mapped but missing required alignment information
630#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
631fn get_unclipped_position(record: &sam::alignment::RecordBuf) -> Result<i32> {
632    if record.flags().is_unmapped() {
633        return Ok(0);
634    }
635
636    unclipped_five_prime_position(record).map(|pos| pos as i32).ok_or_else(|| {
637        let read_name =
638            record.name().map_or_else(|| "unknown".into(), |n| String::from_utf8_lossy(n.as_ref()));
639        anyhow::anyhow!("Mapped read '{read_name}' missing alignment start position")
640    })
641}
642
643#[cfg(test)]
644mod tests {
645    use super::*;
646
647    #[test]
648    fn test_read_info_ordering() {
649        let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
650        let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
651        assert_eq!(info1, info2);
652    }
653
654    #[test]
655    fn test_read_info_normalizes_order() {
656        // R1 at higher position, R2 at lower position
657        let info1 = ReadInfo::new(0, 200, 1, 0, 100, 0, Arc::from("lib1"), None);
658        // R1 at lower position, R2 at higher position
659        let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
660
661        // Normalization puts lower position first (matching fgbio)
662        // info1 started with (200, 1) then (100, 0), should be normalized to (100, 0) then (200, 1)
663        assert_eq!(info1.ref_index1, 0);
664        assert_eq!(info1.start1, 100);
665        assert_eq!(info1.strand1, 0);
666        assert_eq!(info1.ref_index2, 0);
667        assert_eq!(info1.start2, 200);
668        assert_eq!(info1.strand2, 1);
669
670        // Both should be equal after normalization (matching fgbio behavior)
671        assert_eq!(info1, info2);
672    }
673
674    #[test]
675    fn test_read_info_unmapped() {
676        let info = ReadInfo::unmapped(Arc::from("lib1"), None);
677        assert_eq!(info.ref_index1, ReadInfo::UNKNOWN_REF);
678        assert_eq!(info.start1, ReadInfo::UNKNOWN_POS);
679    }
680
681    #[test]
682    fn test_read_info_single() {
683        let info = ReadInfo::single(1, 500, 0, Arc::from("lib1"), None);
684        assert_eq!(info.ref_index1, 1);
685        assert_eq!(info.start1, 500);
686        assert_eq!(info.strand1, 0);
687        assert_eq!(info.ref_index2, ReadInfo::UNKNOWN_REF);
688    }
689
690    #[test]
691    fn test_strand_to_byte() {
692        assert_eq!(ReadInfo::strand_to_byte(true), 0);
693        assert_eq!(ReadInfo::strand_to_byte(false), 1);
694    }
695
696    #[test]
697    fn test_read_info_comparison() {
698        let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
699        let info2 = ReadInfo::new(0, 150, 0, 0, 200, 1, Arc::from("lib1"), None);
700
701        assert!(info1 < info2);
702    }
703
704    #[test]
705    fn test_different_chromosomes() {
706        // R1 on chr0, R2 on chr1 - chr0 is earlier, so R1 stays first
707        let info1 = ReadInfo::new(0, 100, 0, 1, 200, 1, Arc::from("lib1"), None);
708        // R1 on chr1, R2 on chr0 - chr0 is earlier, so R2 becomes first after normalization
709        let info2 = ReadInfo::new(1, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
710
711        // Normalization: lower ref_index comes first
712        // info1: chr0 < chr1, so keeps original (ref_index1=0)
713        assert_eq!(info1.ref_index1, 0);
714        assert_eq!(info1.start1, 100);
715
716        // info2: chr1 > chr0, so swaps to put chr0 first (ref_index1=0)
717        assert_eq!(info2.ref_index1, 0);
718        assert_eq!(info2.start1, 200); // This was R2's position
719
720        // These are different ReadInfo values (different positions within each slot)
721        assert_ne!(info1, info2);
722    }
723
724    #[test]
725    fn test_build_library_lookup() {
726        use bstr::BString;
727        use noodles::sam::header::record::value::Map;
728        use noodles::sam::header::record::value::map::ReadGroup;
729        use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
730
731        // Build a header with two read groups having different libraries
732        let mut header = Header::builder();
733
734        // First read group with library "libA"
735        let rg1 = Map::<ReadGroup>::builder()
736            .insert(rg_tag::LIBRARY, String::from("libA"))
737            .build()
738            .unwrap();
739        header = header.add_read_group(BString::from("RG1"), rg1);
740
741        // Second read group with library "libB"
742        let rg2 = Map::<ReadGroup>::builder()
743            .insert(rg_tag::LIBRARY, String::from("libB"))
744            .build()
745            .unwrap();
746        header = header.add_read_group(BString::from("RG2"), rg2);
747
748        // Third read group with no library (should default to "unknown")
749        let rg3 = Map::<ReadGroup>::builder().build().unwrap();
750        header = header.add_read_group(BString::from("RG3"), rg3);
751
752        let header = header.build();
753        let lookup = super::build_library_lookup(&header);
754
755        // Verify the lookup - compare Arc<str> values
756        assert_eq!(lookup.len(), 3);
757        assert_eq!(lookup.get("RG1").map(AsRef::as_ref), Some("libA"));
758        assert_eq!(lookup.get("RG2").map(AsRef::as_ref), Some("libB"));
759        assert_eq!(lookup.get("RG3").map(AsRef::as_ref), Some("unknown"));
760    }
761
762    #[test]
763    fn test_single_uses_unknown_values() {
764        // Single-end reads should use UNKNOWN values for R2 (matching fgbio)
765        let info = ReadInfo::single(1, 500, 0, Arc::from("lib1"), None);
766        assert_eq!(info.ref_index1, 1);
767        assert_eq!(info.start1, 500);
768        assert_eq!(info.strand1, 0);
769        // R2 should have UNKNOWN values (i32::MAX, i32::MAX, u8::MAX)
770        assert_eq!(info.ref_index2, i32::MAX);
771        assert_eq!(info.start2, i32::MAX);
772        assert_eq!(info.strand2, u8::MAX);
773    }
774
775    #[test]
776    fn test_single_sorts_after_paired() {
777        // Single-end reads should sort after paired-end reads because they have UNKNOWN (MAX) values
778        let paired = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
779        let single = ReadInfo::single(0, 100, 0, Arc::from("lib1"), None);
780
781        // Single should be greater than paired (sorts after)
782        assert!(single > paired);
783    }
784
785    #[test]
786    fn test_ord_includes_cell_barcode() {
787        // Two ReadInfos that differ only in cell_barcode should compare differently
788        let lib: Arc<str> = Arc::from("lib1");
789        let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib.clone(), Some("cell1".to_string()));
790        let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib.clone(), Some("cell2".to_string()));
791        let info3 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib, None);
792
793        // cell1 < cell2 (alphabetically)
794        assert!(info1 < info2);
795        // None < Some(_) in Rust's Option ordering
796        assert!(info3 < info1);
797        // All three are different
798        assert_ne!(info1, info2);
799        assert_ne!(info1, info3);
800    }
801
802    #[test]
803    fn test_normalization_with_strand_tiebreaker() {
804        // Same ref and position, but different strands - should use strand as tiebreaker
805        // Forward (0) is "earlier" than reverse (1)
806        let info1 = ReadInfo::new(0, 100, 1, 0, 100, 0, Arc::from("lib1"), None);
807        // After normalization, strand 0 should come first
808        assert_eq!(info1.strand1, 0);
809        assert_eq!(info1.strand2, 1);
810    }
811
812    mod get_unclipped_position_tests {
813        use super::*;
814        use crate::sam::builder::RecordBuilder;
815        use noodles::sam::alignment::RecordBuf;
816
817        /// Helper to build a test record with specific CIGAR and strand
818        fn build_record(alignment_start: usize, cigar: &str, reverse: bool) -> RecordBuf {
819            RecordBuilder::new()
820                .sequence("ACGT") // Minimal sequence
821                .alignment_start(alignment_start)
822                .cigar(cigar)
823                .reverse_complement(reverse)
824                .build()
825        }
826
827        #[test]
828        fn test_forward_strand_soft_clip_only() {
829            // Forward strand with 5S at start: alignment_start=100, CIGAR=5S50M
830            // unclipped_start = 100 - 5 = 95
831            let record = build_record(100, "5S50M", false);
832            let pos = get_unclipped_position(&record).unwrap();
833            assert_eq!(pos, 95);
834        }
835
836        #[test]
837        fn test_forward_strand_hard_clip_only() {
838            // Forward strand with 10H at start: alignment_start=100, CIGAR=10H50M
839            // unclipped_start = 100 - 10 = 90
840            let record = build_record(100, "10H50M", false);
841            let pos = get_unclipped_position(&record).unwrap();
842            assert_eq!(pos, 90);
843        }
844
845        #[test]
846        fn test_forward_strand_hard_and_soft_clip() {
847            // Forward strand with 10H5S at start: alignment_start=100, CIGAR=10H5S50M
848            // unclipped_start = 100 - 10 - 5 = 85
849            let record = build_record(100, "10H5S50M", false);
850            let pos = get_unclipped_position(&record).unwrap();
851            assert_eq!(pos, 85);
852        }
853
854        #[test]
855        fn test_reverse_strand_soft_clip_only() {
856            // Reverse strand with 5S at end: alignment_start=100, CIGAR=50M5S
857            // alignment_end = 100 + 50 - 1 = 149
858            // unclipped_end = 149 + 5 = 154
859            let record = build_record(100, "50M5S", true);
860            let pos = get_unclipped_position(&record).unwrap();
861            assert_eq!(pos, 154);
862        }
863
864        #[test]
865        fn test_reverse_strand_hard_clip_only() {
866            // Reverse strand with 10H at end: alignment_start=100, CIGAR=50M10H
867            // alignment_end = 100 + 50 - 1 = 149
868            // unclipped_end = 149 + 10 = 159
869            let record = build_record(100, "50M10H", true);
870            let pos = get_unclipped_position(&record).unwrap();
871            assert_eq!(pos, 159);
872        }
873
874        #[test]
875        fn test_reverse_strand_hard_and_soft_clip() {
876            // Reverse strand with 5S10H at end: alignment_start=100, CIGAR=50M5S10H
877            // alignment_end = 100 + 50 - 1 = 149
878            // unclipped_end = 149 + 5 + 10 = 164
879            let record = build_record(100, "50M5S10H", true);
880            let pos = get_unclipped_position(&record).unwrap();
881            assert_eq!(pos, 164);
882        }
883
884        #[test]
885        fn test_no_clipping() {
886            // No clipping: alignment_start=100, CIGAR=50M
887            // Forward: unclipped_start = 100
888            let forward_record = build_record(100, "50M", false);
889            assert_eq!(get_unclipped_position(&forward_record).unwrap(), 100);
890
891            // Reverse: unclipped_end = 100 + 50 - 1 = 149
892            let reverse_record = build_record(100, "50M", true);
893            assert_eq!(get_unclipped_position(&reverse_record).unwrap(), 149);
894        }
895
896        #[test]
897        fn test_unmapped_read() {
898            let record = RecordBuilder::new().sequence("ACGT").unmapped(true).build();
899            assert_eq!(get_unclipped_position(&record).unwrap(), 0);
900        }
901
902        #[test]
903        fn test_complex_cigar_with_insertions_deletions() {
904            // Complex CIGAR: 5H3S10M2I5M3D10M4S6H
905            // Forward strand: alignment_start=100
906            // Leading clips = 5H + 3S = 8
907            // unclipped_start = 100 - 8 = 92
908            let forward_record = build_record(100, "5H3S10M2I5M3D10M4S6H", false);
909            assert_eq!(get_unclipped_position(&forward_record).unwrap(), 92);
910
911            // Reverse strand: same CIGAR
912            // alignment_span = 10 + 5 + 3 + 10 = 28 (M + D consume reference)
913            // alignment_end = 100 + 28 - 1 = 127
914            // Trailing clips = 4S + 6H = 10
915            // unclipped_end = 127 + 10 = 137
916            let reverse_record = build_record(100, "5H3S10M2I5M3D10M4S6H", true);
917            assert_eq!(get_unclipped_position(&reverse_record).unwrap(), 137);
918        }
919    }
920}