fgumi_lib/read_info.rs
1//! Data structures for tracking read position information.
2//!
3//! This module provides the `ReadInfo` structure for tracking genomic position information
4//! about reads, which is essential for:
5//! - Grouping reads by genomic location
6//! - Ordering reads for duplicate marking
7//! - Organizing reads for UMI-based consensus calling
8//!
9//! `ReadInfo` captures the unclipped 5' positions of paired-end reads (or single-end reads),
10//! along with library and cell barcode information for proper grouping.
11
12use anyhow::Result;
13use noodles::sam::alignment::record::data::field::Tag;
14use noodles::sam::header::Header;
15use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
16use std::cmp::Ordering;
17use std::collections::HashMap;
18use std::sync::Arc;
19
20use bstr::ByteSlice;
21
22use crate::sam::record_utils::{
23 mate_unclipped_end, mate_unclipped_start, unclipped_five_prime_position,
24};
25use crate::template::Template;
26use crate::unified_pipeline::GroupKey;
27use noodles::sam;
28use noodles::sam::alignment::record_buf::data::field::value::Value as DataValue;
29
30/// A lookup table mapping read group IDs to library names.
31///
32/// This is built from the SAM header's @RG lines and used to resolve the library
33/// name (LB field) from a record's RG tag. This matches fgbio's behavior where
34/// grouping uses the library name, not the read group ID.
35///
36/// Uses `Arc<str>` for library names to avoid cloning strings for every read.
37///
38/// # Note: `LibraryLookup` vs `LibraryIndex`
39///
40/// Both `LibraryLookup` and [`LibraryIndex`] exist for different use cases:
41/// - `LibraryLookup`: String-based lookup returning library names. Used by
42/// [`ReadInfo::from`] where the actual library name string is needed.
43/// - [`LibraryIndex`]: Hash-based lookup returning numeric indices. Used by
44/// [`compute_group_key`] in the hot path where only equality comparison matters,
45/// avoiding string allocations.
46pub type LibraryLookup = Arc<HashMap<String, Arc<str>>>;
47
48/// Shared "unknown" library string to avoid repeated allocations.
49static UNKNOWN_LIBRARY: std::sync::LazyLock<Arc<str>> =
50 std::sync::LazyLock::new(|| Arc::from("unknown"));
51
52/// Builds a library lookup table from a SAM header.
53///
54/// Iterates through all @RG lines in the header and creates a mapping from
55/// read group ID to library name. If a read group has no LB field, it maps
56/// to "unknown" (matching fgbio's behavior).
57///
58/// # Arguments
59///
60/// * `header` - The SAM header containing @RG lines
61///
62/// # Returns
63///
64/// An `Arc<HashMap>` mapping read group IDs to library names
65#[must_use]
66pub fn build_library_lookup(header: &Header) -> LibraryLookup {
67 let mut lookup = HashMap::new();
68
69 for (id, rg) in header.read_groups() {
70 // Get the LB field from the read group's other_fields
71 let library: Arc<str> = rg
72 .other_fields()
73 .get(&rg_tag::LIBRARY)
74 .map_or_else(|| Arc::clone(&UNKNOWN_LIBRARY), |s| Arc::from(s.to_string()));
75 lookup.insert(id.to_string(), library);
76 }
77
78 Arc::new(lookup)
79}
80
81// ============================================================================
82// LibraryIndex - Fast RG to library index mapping for GroupKey computation
83// ============================================================================
84
85/// Fast lookup from RG tag value to library index.
86///
87/// This provides O(1) library lookup during Decode using string hashing,
88/// replacing the O(n) string comparison in the original `LibraryLookup`.
89#[derive(Debug, Clone)]
90pub struct LibraryIndex {
91 /// Map from RG string hash to library index.
92 lookup: ahash::AHashMap<u64, u16>,
93 /// Library names for each index (for output/debugging).
94 names: Vec<Arc<str>>,
95 /// Unknown library index (always 0).
96 unknown_idx: u16,
97}
98
99impl LibraryIndex {
100 /// Build a library index from a SAM header.
101 ///
102 /// Each unique library name gets a sequential index starting from 0.
103 /// Index 0 is reserved for "unknown" library.
104 ///
105 /// # Panics
106 ///
107 /// Panics if the header contains more than 65,535 distinct libraries.
108 #[must_use]
109 pub fn from_header(header: &Header) -> Self {
110 let mut lookup = ahash::AHashMap::new();
111 let mut names = vec![Arc::clone(&UNKNOWN_LIBRARY)]; // Index 0 = unknown
112 let mut library_to_idx: ahash::AHashMap<Arc<str>, u16> = ahash::AHashMap::new();
113 library_to_idx.insert(Arc::clone(&UNKNOWN_LIBRARY), 0);
114
115 for (id, rg) in header.read_groups() {
116 // Get library name from LB field
117 let library: Arc<str> = rg
118 .other_fields()
119 .get(&rg_tag::LIBRARY)
120 .map_or_else(|| Arc::clone(&UNKNOWN_LIBRARY), |s| Arc::from(s.to_string()));
121
122 // Get or create library index
123 let lib_idx = *library_to_idx.entry(library.clone()).or_insert_with(|| {
124 let idx: u16 =
125 names.len().try_into().expect("too many distinct libraries for u16 index");
126 names.push(library);
127 idx
128 });
129
130 // Hash the RG string and map to library index
131 let rg_hash = Self::hash_rg(id.as_bytes());
132 lookup.insert(rg_hash, lib_idx);
133 }
134
135 Self { lookup, names, unknown_idx: 0 }
136 }
137
138 /// Get the library index for a read group hash.
139 ///
140 /// Returns 0 (unknown) if the RG hash is not found.
141 #[must_use]
142 pub fn get(&self, rg_hash: u64) -> u16 {
143 *self.lookup.get(&rg_hash).unwrap_or(&self.unknown_idx)
144 }
145
146 /// Get the library name for an index.
147 #[must_use]
148 pub fn library_name(&self, idx: u16) -> &Arc<str> {
149 self.names.get(idx as usize).unwrap_or(&self.names[0])
150 }
151
152 /// Hash a byte slice using `AHash`. Returns 0 for `None`.
153 ///
154 /// This is the single hashing implementation used by all `hash_*` methods.
155 #[must_use]
156 pub fn hash_bytes(bytes: Option<&[u8]>) -> u64 {
157 use ahash::AHasher;
158 use std::hash::{Hash, Hasher};
159 match bytes {
160 Some(b) => {
161 let mut hasher = AHasher::default();
162 b.hash(&mut hasher);
163 hasher.finish()
164 }
165 None => 0,
166 }
167 }
168
169 /// Hash an RG tag value for lookup.
170 #[must_use]
171 pub fn hash_rg(rg_bytes: &[u8]) -> u64 {
172 Self::hash_bytes(Some(rg_bytes))
173 }
174
175 /// Hash a cell barcode for `GroupKey`.
176 #[must_use]
177 pub fn hash_cell_barcode(cell_bytes: Option<&[u8]>) -> u64 {
178 Self::hash_bytes(cell_bytes)
179 }
180
181 /// Hash a read name for `GroupKey`.
182 #[must_use]
183 pub fn hash_name(name_bytes: Option<&[u8]>) -> u64 {
184 Self::hash_bytes(name_bytes)
185 }
186}
187
188impl Default for LibraryIndex {
189 fn default() -> Self {
190 Self {
191 lookup: ahash::AHashMap::new(),
192 names: vec![Arc::clone(&UNKNOWN_LIBRARY)],
193 unknown_idx: 0,
194 }
195 }
196}
197
198// ============================================================================
199// compute_group_key - Pre-compute GroupKey from a single BAM record
200// ============================================================================
201
202/// Compute a `GroupKey` from a single BAM record.
203///
204/// This computes all grouping information from a single record:
205/// - Own position (`ref_id`, unclipped 5' position, strand)
206/// - Mate position (from MC tag, or UNKNOWN if unpaired/missing)
207/// - Library index (from RG tag)
208/// - Cell barcode hash
209/// - Name hash
210///
211/// For paired-end reads with MC tag, both positions are computed and normalized
212/// so the lower position comes first (matching `ReadInfo` behavior).
213///
214/// For unpaired reads or reads without MC tag, mate position uses UNKNOWN sentinels.
215#[must_use]
216#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
217pub fn compute_group_key(
218 record: &sam::alignment::RecordBuf,
219 library_index: &LibraryIndex,
220 cell_tag: Option<Tag>,
221) -> GroupKey {
222 // Extract name hash
223 let name_hash = LibraryIndex::hash_name(record.name().map(AsRef::as_ref));
224
225 // Skip secondary and supplementary alignments - they get a default key with UNKNOWN_REF.
226 // RecordPositionGrouper either skips them or coalesces them by name_hash.
227 let flags = record.flags();
228 if flags.is_secondary() || flags.is_supplementary() {
229 return GroupKey { name_hash, ..GroupKey::default() };
230 }
231
232 // Own position
233 let ref_id =
234 record.reference_sequence_id().map_or(-1, |id| i32::try_from(id).unwrap_or(i32::MAX));
235 let pos = get_unclipped_position_for_groupkey(record);
236 let strand = u8::from(flags.is_reverse_complemented());
237
238 // Extract library index from RG tag
239 let library_idx = if let Some(DataValue::String(rg_bytes)) = record.data().get(b"RG") {
240 let rg_hash = LibraryIndex::hash_rg(rg_bytes);
241 library_index.get(rg_hash)
242 } else {
243 0 // unknown
244 };
245
246 // Extract cell barcode hash
247 let cell_hash = if let Some(tag) = cell_tag {
248 if let Some(DataValue::String(cb_bytes)) = record.data().get(&tag) {
249 LibraryIndex::hash_cell_barcode(Some(cb_bytes))
250 } else {
251 0
252 }
253 } else {
254 0
255 };
256
257 // Check if paired and has mate info
258 let is_paired = flags.is_segmented();
259 if !is_paired {
260 return GroupKey::single(ref_id, pos, strand, library_idx, cell_hash, name_hash);
261 }
262
263 // Try to get mate position from MC tag
264 let mate_strand = u8::from(flags.is_mate_reverse_complemented());
265 let mate_ref_id =
266 record.mate_reference_sequence_id().map_or(-1, |id| i32::try_from(id).unwrap_or(i32::MAX));
267
268 // Get mate unclipped 5' position based on strand
269 let mate_pos = if mate_strand == 1 {
270 // Reverse strand - 5' is at the end
271 mate_unclipped_end(record).map(|p| p as i32)
272 } else {
273 // Forward strand - 5' is at the start
274 mate_unclipped_start(record).map(|p| p as i32)
275 };
276
277 match mate_pos {
278 Some(mp) => GroupKey::paired(
279 ref_id,
280 pos,
281 strand,
282 mate_ref_id,
283 mp,
284 mate_strand,
285 library_idx,
286 cell_hash,
287 name_hash,
288 ),
289 None => {
290 // No MC tag - fall back to single-end behavior
291 GroupKey::single(ref_id, pos, strand, library_idx, cell_hash, name_hash)
292 }
293 }
294}
295
296/// Get unclipped 5' position for `GroupKey` computation.
297///
298/// Returns 0 for unmapped reads. Returns `i32::MAX` for malformed mapped reads
299/// (those missing alignment start) to avoid incorrectly grouping them at position 0.
300/// Used in the hot path where we need a numeric value for grouping.
301#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
302fn get_unclipped_position_for_groupkey(record: &sam::alignment::RecordBuf) -> i32 {
303 if record.flags().is_unmapped() {
304 return 0;
305 }
306 unclipped_five_prime_position(record).map_or(i32::MAX, |pos| pos as i32)
307}
308
309/// Information about read positions needed for grouping and ordering.
310///
311/// This structure stores genomic position information for one or two reads (paired-end),
312/// along with library and optional cell barcode information. The R1/R2 positions are
313/// stored in their original order without normalization, matching fgbio's behavior.
314///
315/// # Fields
316///
317/// * `ref_index1` - Reference sequence index for R1 (first read of pair)
318/// * `start1` - Unclipped 5' start position for R1
319/// * `strand1` - Strand for R1 (0=forward, 1=reverse)
320/// * `ref_index2` - Reference sequence index for R2 (or `UNKNOWN_REF` if unpaired)
321/// * `start2` - Unclipped 5' start position for R2
322/// * `strand2` - Strand for R2
323/// * `library` - Library identifier from RG tag
324/// * `cell_barcode` - Optional cell barcode for single-cell applications
325///
326/// # Ordering
327///
328/// `ReadInfo` implements `Ord` matching fgbio's case class field order:
329/// 1. Reference index 1 (R1)
330/// 2. Start position 1 (R1)
331/// 3. Strand 1 (R1)
332/// 4. Reference index 2 (R2)
333/// 5. Start position 2 (R2)
334/// 6. Strand 2 (R2)
335/// 7. Library
336/// 8. Cell barcode (optional)
337#[derive(Debug, Clone, PartialEq, Eq, Hash)]
338pub struct ReadInfo {
339 /// Reference sequence index for first read
340 pub ref_index1: i32,
341 /// Unclipped 5' position for first read
342 pub start1: i32,
343 /// Strand for first read (0=forward, 1=reverse)
344 pub strand1: u8,
345 /// Reference sequence index for second read
346 pub ref_index2: i32,
347 /// Unclipped 5' position for second read
348 pub start2: i32,
349 /// Strand for second read (0=forward, 1=reverse)
350 pub strand2: u8,
351 /// Library identifier (from RG tag). Uses `Arc<str>` to avoid cloning for every read.
352 pub library: Arc<str>,
353 /// Optional cell barcode (for single-cell data)
354 pub cell_barcode: Option<String>,
355}
356
357impl ReadInfo {
358 // Maximum values for unmapped reads
359 const UNKNOWN_REF: i32 = i32::MAX;
360 const UNKNOWN_POS: i32 = i32::MAX;
361 const UNKNOWN_STRAND: u8 = u8::MAX;
362
363 /// Creates a new `ReadInfo`, automatically ordering by lower coordinate first.
364 ///
365 /// This constructor automatically normalizes the order so that the read with the
366 /// genomically earlier position becomes position 1. This matches fgbio's behavior
367 /// where the lower of the two mates' positions comes first.
368 ///
369 /// # Arguments
370 ///
371 /// * `ref_index1` - Reference index for first read
372 /// * `start1` - Start position for first read
373 /// * `strand1` - Strand for first read (0=forward, 1=reverse)
374 /// * `ref_index2` - Reference index for second read
375 /// * `start2` - Start position for second read
376 /// * `strand2` - Strand for second read
377 /// * `library` - Library identifier
378 /// * `cell_barcode` - Optional cell barcode
379 ///
380 /// # Returns
381 ///
382 /// A new `ReadInfo` with reads ordered by genomic position (lower position first)
383 #[must_use]
384 #[allow(clippy::too_many_arguments)]
385 pub fn new(
386 ref_index1: i32,
387 start1: i32,
388 strand1: u8,
389 ref_index2: i32,
390 start2: i32,
391 strand2: u8,
392 library: Arc<str>,
393 cell_barcode: Option<String>,
394 ) -> Self {
395 // Normalize order: put lower genomic position first (matching fgbio)
396 // Uses tuple comparison for cleaner, potentially faster code
397 let r1_earlier = (ref_index1, start1, strand1) <= (ref_index2, start2, strand2);
398
399 if r1_earlier {
400 Self { ref_index1, start1, strand1, ref_index2, start2, strand2, library, cell_barcode }
401 } else {
402 Self {
403 ref_index1: ref_index2,
404 start1: start2,
405 strand1: strand2,
406 ref_index2: ref_index1,
407 start2: start1,
408 strand2: strand1,
409 library,
410 cell_barcode,
411 }
412 }
413 }
414
415 /// Creates `ReadInfo` for a single-end or fragment read.
416 ///
417 /// This constructor is used for unpaired reads or when only one read of a pair
418 /// is available. The second read fields are set to UNKNOWN values.
419 ///
420 /// # Arguments
421 ///
422 /// * `ref_index` - Reference sequence index
423 /// * `start` - Unclipped 5' start position
424 /// * `strand` - Strand (0=forward, 1=reverse)
425 /// * `library` - Library identifier
426 /// * `cell_barcode` - Optional cell barcode
427 ///
428 /// # Returns
429 ///
430 /// A new `ReadInfo` for a single read
431 #[must_use]
432 pub fn single(
433 ref_index: i32,
434 start: i32,
435 strand: u8,
436 library: Arc<str>,
437 cell_barcode: Option<String>,
438 ) -> Self {
439 Self {
440 ref_index1: ref_index,
441 start1: start,
442 strand1: strand,
443 ref_index2: Self::UNKNOWN_REF,
444 start2: Self::UNKNOWN_POS,
445 strand2: Self::UNKNOWN_STRAND,
446 library,
447 cell_barcode,
448 }
449 }
450
451 /// Creates `ReadInfo` for an unmapped read.
452 ///
453 /// All position and reference fields are set to UNKNOWN values since the
454 /// read has no genomic alignment.
455 ///
456 /// # Arguments
457 ///
458 /// * `library` - Library identifier
459 /// * `cell_barcode` - Optional cell barcode
460 ///
461 /// # Returns
462 ///
463 /// A new `ReadInfo` for an unmapped read
464 #[must_use]
465 pub fn unmapped(library: Arc<str>, cell_barcode: Option<String>) -> Self {
466 Self {
467 ref_index1: Self::UNKNOWN_REF,
468 start1: Self::UNKNOWN_POS,
469 strand1: Self::UNKNOWN_STRAND,
470 ref_index2: Self::UNKNOWN_REF,
471 start2: Self::UNKNOWN_POS,
472 strand2: Self::UNKNOWN_STRAND,
473 library,
474 cell_barcode,
475 }
476 }
477
478 /// Converts a strand boolean to a byte representation.
479 ///
480 /// Converts the strand direction to the byte format used by `ReadInfo`:
481 /// - `true` (positive/forward strand) -> 0
482 /// - `false` (negative/reverse strand) -> 1
483 ///
484 /// # Arguments
485 ///
486 /// * `positive` - `true` for forward strand, `false` for reverse
487 ///
488 /// # Returns
489 ///
490 /// Byte representation (0 or 1)
491 #[must_use]
492 pub fn strand_to_byte(positive: bool) -> u8 {
493 u8::from(!positive)
494 }
495
496 /// Builds a `ReadInfo` from a template.
497 ///
498 /// Extracts position information from the template's reads, including unclipped
499 /// 5' positions, library information from the header's @RG line (via library lookup),
500 /// and optional cell barcode.
501 ///
502 /// # Arguments
503 ///
504 /// * `template` - The template containing one or two reads
505 /// * `cell_tag` - SAM tag to use for extracting cell barcode (e.g., "CB")
506 /// * `library_lookup` - Lookup table mapping read group IDs to library names
507 ///
508 /// # Returns
509 ///
510 /// A new `ReadInfo` populated from the template
511 ///
512 /// # Errors
513 ///
514 /// Returns an error if:
515 /// - The template has no records
516 /// - The unclipped position could not be computed (e.g., missing alignment data)
517 pub fn from(
518 template: &Template,
519 cell_tag: Tag,
520 library_lookup: &LibraryLookup,
521 ) -> Result<Self> {
522 // Get reads once and reuse (avoid calling r1()/r2() twice)
523 let r1 = template.r1();
524 let r2 = template.r2();
525 let record = r1.as_ref().or(r2.as_ref()).ok_or_else(|| {
526 anyhow::anyhow!(
527 "Template '{}' has no records (empty template)",
528 String::from_utf8_lossy(&template.name)
529 )
530 })?;
531
532 // Extract library from RG tag, then look up the library name from the header.
533 // This matches fgbio's behavior where grouping uses the library name (LB field),
534 // not the read group ID.
535 // Uses Arc<str> to avoid cloning strings - Arc::clone is just a reference count increment.
536 let library: Arc<str> = if let Some(DataValue::String(rg_bytes)) = record.data().get(b"RG")
537 {
538 // Convert bytes to str for lookup without allocating a String
539 let rg_id = std::str::from_utf8(rg_bytes).unwrap_or("unknown");
540 library_lookup.get(rg_id).cloned().unwrap_or_else(|| Arc::clone(&UNKNOWN_LIBRARY))
541 } else {
542 Arc::clone(&UNKNOWN_LIBRARY)
543 };
544
545 // Extract cell barcode
546 let cell_barcode = if let Some(DataValue::String(cb_str)) = record.data().get(&cell_tag) {
547 Some(String::from_utf8_lossy(cb_str).into_owned())
548 } else {
549 None
550 };
551
552 // For paired-end, extract both positions using UNCLIPPED positions
553 // Reuse r1/r2 bindings instead of calling template.r1()/r2() again
554 #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
555 if let (Some(r1), Some(r2)) = (&r1, &r2) {
556 let ref_index1 = r1.reference_sequence_id().map_or(-1, |id| id as i32);
557 let start1 = get_unclipped_position(r1)?;
558 let strand1 = u8::from(r1.flags().is_reverse_complemented());
559
560 let ref_index2 = r2.reference_sequence_id().map_or(-1, |id| id as i32);
561 let start2 = get_unclipped_position(r2)?;
562 let strand2 = u8::from(r2.flags().is_reverse_complemented());
563
564 Ok(ReadInfo::new(
565 ref_index1,
566 start1,
567 strand1,
568 ref_index2,
569 start2,
570 strand2,
571 library,
572 cell_barcode,
573 ))
574 } else {
575 // Single-end: use ReadInfo::single() which correctly sets UNKNOWN values for R2
576 // This matches fgbio's behavior where missing R2 uses (Int.MaxValue, Int.MaxValue, Byte.MaxValue)
577 let ref_index = record.reference_sequence_id().map_or(-1, |id| id as i32);
578 let start = get_unclipped_position(record)?;
579 let strand = u8::from(record.flags().is_reverse_complemented());
580
581 Ok(ReadInfo::single(ref_index, start, strand, library, cell_barcode))
582 }
583 }
584}
585
586impl PartialOrd for ReadInfo {
587 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
588 Some(self.cmp(other))
589 }
590}
591
592impl Ord for ReadInfo {
593 /// Compares `ReadInfo` in the same order as fgbio's case class field order:
594 /// refIndex1, start1, strand1, refIndex2, start2, strand2, library, cellBarcode
595 fn cmp(&self, other: &Self) -> Ordering {
596 self.ref_index1
597 .cmp(&other.ref_index1)
598 .then_with(|| self.start1.cmp(&other.start1))
599 .then_with(|| self.strand1.cmp(&other.strand1))
600 .then_with(|| self.ref_index2.cmp(&other.ref_index2))
601 .then_with(|| self.start2.cmp(&other.start2))
602 .then_with(|| self.strand2.cmp(&other.strand2))
603 .then_with(|| self.library.cmp(&other.library))
604 .then_with(|| self.cell_barcode.cmp(&other.cell_barcode))
605 }
606}
607
608/// Gets the unclipped 5' position of a read.
609///
610/// For reads with clipped bases (soft or hard), this function calculates the position that would
611/// correspond to the 5' end of the read if it were fully aligned (including clipped bases).
612///
613/// - **Forward strand**: Returns alignment start minus leading clips (soft + hard)
614/// - **Reverse strand**: Returns alignment end plus trailing clips (soft + hard)
615///
616/// This matches htsjdk's `getUnclippedStart` and `getUnclippedEnd` which consider BOTH
617/// soft clips and hard clips.
618///
619/// # Arguments
620///
621/// * `record` - The SAM/BAM record
622///
623/// # Returns
624///
625/// The unclipped 5' position (0 for unmapped reads)
626///
627/// # Errors
628///
629/// Returns an error if the record is mapped but missing required alignment information
630#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
631fn get_unclipped_position(record: &sam::alignment::RecordBuf) -> Result<i32> {
632 if record.flags().is_unmapped() {
633 return Ok(0);
634 }
635
636 unclipped_five_prime_position(record).map(|pos| pos as i32).ok_or_else(|| {
637 let read_name =
638 record.name().map_or_else(|| "unknown".into(), |n| String::from_utf8_lossy(n.as_ref()));
639 anyhow::anyhow!("Mapped read '{read_name}' missing alignment start position")
640 })
641}
642
643#[cfg(test)]
644mod tests {
645 use super::*;
646
647 #[test]
648 fn test_read_info_ordering() {
649 let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
650 let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
651 assert_eq!(info1, info2);
652 }
653
654 #[test]
655 fn test_read_info_normalizes_order() {
656 // R1 at higher position, R2 at lower position
657 let info1 = ReadInfo::new(0, 200, 1, 0, 100, 0, Arc::from("lib1"), None);
658 // R1 at lower position, R2 at higher position
659 let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
660
661 // Normalization puts lower position first (matching fgbio)
662 // info1 started with (200, 1) then (100, 0), should be normalized to (100, 0) then (200, 1)
663 assert_eq!(info1.ref_index1, 0);
664 assert_eq!(info1.start1, 100);
665 assert_eq!(info1.strand1, 0);
666 assert_eq!(info1.ref_index2, 0);
667 assert_eq!(info1.start2, 200);
668 assert_eq!(info1.strand2, 1);
669
670 // Both should be equal after normalization (matching fgbio behavior)
671 assert_eq!(info1, info2);
672 }
673
674 #[test]
675 fn test_read_info_unmapped() {
676 let info = ReadInfo::unmapped(Arc::from("lib1"), None);
677 assert_eq!(info.ref_index1, ReadInfo::UNKNOWN_REF);
678 assert_eq!(info.start1, ReadInfo::UNKNOWN_POS);
679 }
680
681 #[test]
682 fn test_read_info_single() {
683 let info = ReadInfo::single(1, 500, 0, Arc::from("lib1"), None);
684 assert_eq!(info.ref_index1, 1);
685 assert_eq!(info.start1, 500);
686 assert_eq!(info.strand1, 0);
687 assert_eq!(info.ref_index2, ReadInfo::UNKNOWN_REF);
688 }
689
690 #[test]
691 fn test_strand_to_byte() {
692 assert_eq!(ReadInfo::strand_to_byte(true), 0);
693 assert_eq!(ReadInfo::strand_to_byte(false), 1);
694 }
695
696 #[test]
697 fn test_read_info_comparison() {
698 let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
699 let info2 = ReadInfo::new(0, 150, 0, 0, 200, 1, Arc::from("lib1"), None);
700
701 assert!(info1 < info2);
702 }
703
704 #[test]
705 fn test_different_chromosomes() {
706 // R1 on chr0, R2 on chr1 - chr0 is earlier, so R1 stays first
707 let info1 = ReadInfo::new(0, 100, 0, 1, 200, 1, Arc::from("lib1"), None);
708 // R1 on chr1, R2 on chr0 - chr0 is earlier, so R2 becomes first after normalization
709 let info2 = ReadInfo::new(1, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
710
711 // Normalization: lower ref_index comes first
712 // info1: chr0 < chr1, so keeps original (ref_index1=0)
713 assert_eq!(info1.ref_index1, 0);
714 assert_eq!(info1.start1, 100);
715
716 // info2: chr1 > chr0, so swaps to put chr0 first (ref_index1=0)
717 assert_eq!(info2.ref_index1, 0);
718 assert_eq!(info2.start1, 200); // This was R2's position
719
720 // These are different ReadInfo values (different positions within each slot)
721 assert_ne!(info1, info2);
722 }
723
724 #[test]
725 fn test_build_library_lookup() {
726 use bstr::BString;
727 use noodles::sam::header::record::value::Map;
728 use noodles::sam::header::record::value::map::ReadGroup;
729 use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
730
731 // Build a header with two read groups having different libraries
732 let mut header = Header::builder();
733
734 // First read group with library "libA"
735 let rg1 = Map::<ReadGroup>::builder()
736 .insert(rg_tag::LIBRARY, String::from("libA"))
737 .build()
738 .unwrap();
739 header = header.add_read_group(BString::from("RG1"), rg1);
740
741 // Second read group with library "libB"
742 let rg2 = Map::<ReadGroup>::builder()
743 .insert(rg_tag::LIBRARY, String::from("libB"))
744 .build()
745 .unwrap();
746 header = header.add_read_group(BString::from("RG2"), rg2);
747
748 // Third read group with no library (should default to "unknown")
749 let rg3 = Map::<ReadGroup>::builder().build().unwrap();
750 header = header.add_read_group(BString::from("RG3"), rg3);
751
752 let header = header.build();
753 let lookup = super::build_library_lookup(&header);
754
755 // Verify the lookup - compare Arc<str> values
756 assert_eq!(lookup.len(), 3);
757 assert_eq!(lookup.get("RG1").map(AsRef::as_ref), Some("libA"));
758 assert_eq!(lookup.get("RG2").map(AsRef::as_ref), Some("libB"));
759 assert_eq!(lookup.get("RG3").map(AsRef::as_ref), Some("unknown"));
760 }
761
762 #[test]
763 fn test_single_uses_unknown_values() {
764 // Single-end reads should use UNKNOWN values for R2 (matching fgbio)
765 let info = ReadInfo::single(1, 500, 0, Arc::from("lib1"), None);
766 assert_eq!(info.ref_index1, 1);
767 assert_eq!(info.start1, 500);
768 assert_eq!(info.strand1, 0);
769 // R2 should have UNKNOWN values (i32::MAX, i32::MAX, u8::MAX)
770 assert_eq!(info.ref_index2, i32::MAX);
771 assert_eq!(info.start2, i32::MAX);
772 assert_eq!(info.strand2, u8::MAX);
773 }
774
775 #[test]
776 fn test_single_sorts_after_paired() {
777 // Single-end reads should sort after paired-end reads because they have UNKNOWN (MAX) values
778 let paired = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
779 let single = ReadInfo::single(0, 100, 0, Arc::from("lib1"), None);
780
781 // Single should be greater than paired (sorts after)
782 assert!(single > paired);
783 }
784
785 #[test]
786 fn test_ord_includes_cell_barcode() {
787 // Two ReadInfos that differ only in cell_barcode should compare differently
788 let lib: Arc<str> = Arc::from("lib1");
789 let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib.clone(), Some("cell1".to_string()));
790 let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib.clone(), Some("cell2".to_string()));
791 let info3 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib, None);
792
793 // cell1 < cell2 (alphabetically)
794 assert!(info1 < info2);
795 // None < Some(_) in Rust's Option ordering
796 assert!(info3 < info1);
797 // All three are different
798 assert_ne!(info1, info2);
799 assert_ne!(info1, info3);
800 }
801
802 #[test]
803 fn test_normalization_with_strand_tiebreaker() {
804 // Same ref and position, but different strands - should use strand as tiebreaker
805 // Forward (0) is "earlier" than reverse (1)
806 let info1 = ReadInfo::new(0, 100, 1, 0, 100, 0, Arc::from("lib1"), None);
807 // After normalization, strand 0 should come first
808 assert_eq!(info1.strand1, 0);
809 assert_eq!(info1.strand2, 1);
810 }
811
812 mod get_unclipped_position_tests {
813 use super::*;
814 use crate::sam::builder::RecordBuilder;
815 use noodles::sam::alignment::RecordBuf;
816
817 /// Helper to build a test record with specific CIGAR and strand
818 fn build_record(alignment_start: usize, cigar: &str, reverse: bool) -> RecordBuf {
819 RecordBuilder::new()
820 .sequence("ACGT") // Minimal sequence
821 .alignment_start(alignment_start)
822 .cigar(cigar)
823 .reverse_complement(reverse)
824 .build()
825 }
826
827 #[test]
828 fn test_forward_strand_soft_clip_only() {
829 // Forward strand with 5S at start: alignment_start=100, CIGAR=5S50M
830 // unclipped_start = 100 - 5 = 95
831 let record = build_record(100, "5S50M", false);
832 let pos = get_unclipped_position(&record).unwrap();
833 assert_eq!(pos, 95);
834 }
835
836 #[test]
837 fn test_forward_strand_hard_clip_only() {
838 // Forward strand with 10H at start: alignment_start=100, CIGAR=10H50M
839 // unclipped_start = 100 - 10 = 90
840 let record = build_record(100, "10H50M", false);
841 let pos = get_unclipped_position(&record).unwrap();
842 assert_eq!(pos, 90);
843 }
844
845 #[test]
846 fn test_forward_strand_hard_and_soft_clip() {
847 // Forward strand with 10H5S at start: alignment_start=100, CIGAR=10H5S50M
848 // unclipped_start = 100 - 10 - 5 = 85
849 let record = build_record(100, "10H5S50M", false);
850 let pos = get_unclipped_position(&record).unwrap();
851 assert_eq!(pos, 85);
852 }
853
854 #[test]
855 fn test_reverse_strand_soft_clip_only() {
856 // Reverse strand with 5S at end: alignment_start=100, CIGAR=50M5S
857 // alignment_end = 100 + 50 - 1 = 149
858 // unclipped_end = 149 + 5 = 154
859 let record = build_record(100, "50M5S", true);
860 let pos = get_unclipped_position(&record).unwrap();
861 assert_eq!(pos, 154);
862 }
863
864 #[test]
865 fn test_reverse_strand_hard_clip_only() {
866 // Reverse strand with 10H at end: alignment_start=100, CIGAR=50M10H
867 // alignment_end = 100 + 50 - 1 = 149
868 // unclipped_end = 149 + 10 = 159
869 let record = build_record(100, "50M10H", true);
870 let pos = get_unclipped_position(&record).unwrap();
871 assert_eq!(pos, 159);
872 }
873
874 #[test]
875 fn test_reverse_strand_hard_and_soft_clip() {
876 // Reverse strand with 5S10H at end: alignment_start=100, CIGAR=50M5S10H
877 // alignment_end = 100 + 50 - 1 = 149
878 // unclipped_end = 149 + 5 + 10 = 164
879 let record = build_record(100, "50M5S10H", true);
880 let pos = get_unclipped_position(&record).unwrap();
881 assert_eq!(pos, 164);
882 }
883
884 #[test]
885 fn test_no_clipping() {
886 // No clipping: alignment_start=100, CIGAR=50M
887 // Forward: unclipped_start = 100
888 let forward_record = build_record(100, "50M", false);
889 assert_eq!(get_unclipped_position(&forward_record).unwrap(), 100);
890
891 // Reverse: unclipped_end = 100 + 50 - 1 = 149
892 let reverse_record = build_record(100, "50M", true);
893 assert_eq!(get_unclipped_position(&reverse_record).unwrap(), 149);
894 }
895
896 #[test]
897 fn test_unmapped_read() {
898 let record = RecordBuilder::new().sequence("ACGT").unmapped(true).build();
899 assert_eq!(get_unclipped_position(&record).unwrap(), 0);
900 }
901
902 #[test]
903 fn test_complex_cigar_with_insertions_deletions() {
904 // Complex CIGAR: 5H3S10M2I5M3D10M4S6H
905 // Forward strand: alignment_start=100
906 // Leading clips = 5H + 3S = 8
907 // unclipped_start = 100 - 8 = 92
908 let forward_record = build_record(100, "5H3S10M2I5M3D10M4S6H", false);
909 assert_eq!(get_unclipped_position(&forward_record).unwrap(), 92);
910
911 // Reverse strand: same CIGAR
912 // alignment_span = 10 + 5 + 3 + 10 = 28 (M + D consume reference)
913 // alignment_end = 100 + 28 - 1 = 127
914 // Trailing clips = 4S + 6H = 10
915 // unclipped_end = 127 + 10 = 137
916 let reverse_record = build_record(100, "5H3S10M2I5M3D10M4S6H", true);
917 assert_eq!(get_unclipped_position(&reverse_record).unwrap(), 137);
918 }
919 }
920}