Skip to main content

fgumi_lib/sort/
keys.rs

1//! Sort key types for BAM record sorting.
2//!
3//! This module provides lightweight sort keys that can be extracted from BAM records
4//! with minimal parsing overhead. Keys are designed for fast comparison and minimal
5//! memory footprint.
6//!
7//! # Key Types
8//!
9//! - [`CoordinateKey`]: Standard genomic coordinate (tid, pos, strand)
10//! - [`QuerynameKey`]: Read name with natural numeric ordering
11//! - [`TemplateKey`](super::inline_buffer::TemplateKey): Template-level position for UMI grouping
12//!
13//! # Generic Sorting Abstraction
14//!
15//! The [`RawSortKey`] trait provides a unified interface for sort keys that can be:
16//! - Extracted directly from raw BAM bytes (zero-copy)
17//! - Serialized/deserialized for temp file storage
18//! - Compared efficiently (O(1) for fixed-size keys)
19//!
20//! This design is inspired by:
21//! - fgbio's `SamOrder` trait (Scala)
22//! - samtools' `bam1_tag` union (C)
23
24use anyhow::Result;
25use noodles::sam::Header;
26use noodles::sam::alignment::record::data::field::Tag;
27use noodles::sam::alignment::record_buf::RecordBuf;
28use std::cmp::Ordering;
29
30use crate::sort::bam_fields;
31use std::io::{Read, Write};
32
33/// The PA (Primary Alignment) tag for secondary/supplementary reads.
34/// Stores template-coordinate sort key as B:i array (6 int32s).
35/// Added by `fgumi zipper` to enable proper template-coordinate sorting.
36pub const PA_TAG: Tag = Tag::new(b'p', b'a');
37
38// ============================================================================
39// Primary Alignment Info (PA tag)
40// ============================================================================
41
42/// Primary alignment info for secondary/supplementary reads.
43///
44/// Stores the template-coordinate sort key from the primary alignments,
45/// enabling secondary/supplementary reads to sort adjacent to their primaries.
46///
47/// # Binary Format
48///
49/// Stored as a `B:i` (int32 array) BAM tag with 6 elements:
50/// `[tid1, pos1, neg1, tid2, pos2, neg2]`
51/// where `neg1`/`neg2` are 0 for forward, 1 for reverse strand.
52///
53/// This format is faster to parse than a string representation and ensures
54/// supplementary reads get the exact same sort key as their primary reads.
55#[derive(Debug, Clone, Copy, PartialEq, Eq)]
56pub struct PrimaryAlignmentInfo {
57    /// Reference ID of the earlier mate (lower position).
58    pub tid1: i32,
59    /// Unclipped 5' position of the earlier mate.
60    pub pos1: i32,
61    /// True if earlier mate is on reverse strand.
62    pub neg1: bool,
63    /// Reference ID of the later mate.
64    pub tid2: i32,
65    /// Unclipped 5' position of the later mate.
66    pub pos2: i32,
67    /// True if later mate is on reverse strand.
68    pub neg2: bool,
69}
70
71impl PrimaryAlignmentInfo {
72    /// Creates a new `PrimaryAlignmentInfo`.
73    #[must_use]
74    pub const fn new(tid1: i32, pos1: i32, neg1: bool, tid2: i32, pos2: i32, neg2: bool) -> Self {
75        Self { tid1, pos1, neg1, tid2, pos2, neg2 }
76    }
77
78    /// Serializes to a BAM tag value (B:i array with 6 elements).
79    #[must_use]
80    pub fn to_tag_value(&self) -> noodles::sam::alignment::record_buf::data::field::Value {
81        use noodles::sam::alignment::record_buf::data::field::Value;
82        use noodles::sam::alignment::record_buf::data::field::value::Array;
83
84        let values: Vec<i32> = vec![
85            self.tid1,
86            self.pos1,
87            i32::from(self.neg1),
88            self.tid2,
89            self.pos2,
90            i32::from(self.neg2),
91        ];
92        Value::Array(Array::Int32(values))
93    }
94
95    /// Deserializes from a BAM tag value (B:i array with 6 int32 elements).
96    ///
97    /// Optimized with a fast path for Int32 arrays (the expected format) that
98    /// avoids heap allocation by directly indexing the array.
99    #[must_use]
100    pub fn from_tag_value(
101        value: &noodles::sam::alignment::record_buf::data::field::Value,
102    ) -> Option<Self> {
103        use noodles::sam::alignment::record_buf::data::field::Value;
104        use noodles::sam::alignment::record_buf::data::field::value::Array;
105
106        match value {
107            Value::Array(arr) => {
108                // Fast path: Int32 array (expected format from zipper) - no allocation
109                if let Array::Int32(v) = arr {
110                    if v.len() == 6 {
111                        return Some(Self {
112                            tid1: v[0],
113                            pos1: v[1],
114                            neg1: v[2] != 0,
115                            tid2: v[3],
116                            pos2: v[4],
117                            neg2: v[5] != 0,
118                        });
119                    }
120                    return None;
121                }
122
123                // Slow path: other array types (rare) - requires allocation
124                let values: Vec<i32> = match arr {
125                    Array::Int8(v) => v.iter().map(|&x| i32::from(x)).collect(),
126                    Array::UInt8(v) => v.iter().map(|&x| i32::from(x)).collect(),
127                    Array::Int16(v) => v.iter().map(|&x| i32::from(x)).collect(),
128                    Array::UInt16(v) => v.iter().map(|&x| i32::from(x)).collect(),
129                    Array::Int32(_) => unreachable!(), // Handled above
130                    Array::UInt32(v) => {
131                        // Use try_from to avoid wrapping for values > i32::MAX
132                        let result: Result<Vec<i32>, _> =
133                            v.iter().map(|&x| i32::try_from(x)).collect();
134                        match result {
135                            Ok(vals) => vals,
136                            Err(_) => return None,
137                        }
138                    }
139                    Array::Float(_) => return None,
140                };
141
142                if values.len() != 6 {
143                    return None;
144                }
145
146                Some(Self {
147                    tid1: values[0],
148                    pos1: values[1],
149                    neg1: values[2] != 0,
150                    tid2: values[3],
151                    pos2: values[4],
152                    neg2: values[5] != 0,
153                })
154            }
155            _ => None,
156        }
157    }
158
159    /// Extracts from a BAM record's PA tag.
160    #[must_use]
161    pub fn from_record(record: &RecordBuf) -> Option<Self> {
162        let value = record.data().get(&PA_TAG)?;
163        Self::from_tag_value(value)
164    }
165}
166
167// ============================================================================
168// Generic Sorting Abstraction (Trait-based, inspired by fgbio/samtools)
169// ============================================================================
170
171/// Trait for sort keys extracted from raw BAM bytes.
172///
173/// This trait provides a unified interface for all sort key types, enabling:
174/// - Zero-copy key extraction from raw BAM bytes
175/// - Efficient serialization for temp file storage
176/// - O(1) comparisons during merge phase (for fixed-size keys)
177///
178/// # Design
179///
180/// Inspired by fgbio's `SamOrder` trait and samtools' `bam1_tag` approach.
181/// Using a trait with monomorphization (`sort_with_keyed<K>`) gives:
182/// - Zero-cost abstraction (no runtime dispatch)
183/// - Type safety (can't mix key types)
184/// - Compile-time optimization (inlining, etc.)
185pub trait RawSortKey: Ord + Clone + Send + Sync + Sized {
186    /// Fixed byte size when serialized, or `None` for variable-length keys.
187    ///
188    /// Fixed-size keys enable O(1) reads from temp files during merge.
189    const SERIALIZED_SIZE: Option<usize>;
190
191    /// Extract a sort key from raw BAM record bytes.
192    ///
193    /// This is the hot path during sorting - implementations should minimize
194    /// parsing overhead by reading only the fields needed for comparison.
195    fn extract(bam: &[u8], ctx: &SortContext) -> Self;
196
197    /// Serialize the key to a writer for temp file storage.
198    ///
199    /// Format should be compact and enable fast deserialization.
200    ///
201    /// # Errors
202    ///
203    /// Returns an error if writing to the writer fails.
204    fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()>;
205
206    /// Deserialize a key from a reader.
207    ///
208    /// Must be the inverse of `write_to`.
209    ///
210    /// # Errors
211    ///
212    /// Returns an error if reading from the reader fails or data is invalid.
213    fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self>;
214}
215
216/// Context needed for sort key extraction (built from BAM header).
217///
218/// This struct holds header-derived information needed by some sort orders:
219/// - `nref`: Number of reference sequences (for coordinate sort unmapped handling)
220/// - `lib_lookup`: Library name ordinals (for template-coordinate sort)
221#[derive(Clone)]
222pub struct SortContext {
223    /// Number of reference sequences (unmapped reads map to nref).
224    pub nref: u32,
225    // Note: LibraryLookup is in raw.rs - we'll use a callback instead
226}
227
228impl SortContext {
229    /// Create a sort context from a BAM header.
230    #[must_use]
231    #[allow(clippy::cast_possible_truncation)]
232    pub fn from_header(header: &Header) -> Self {
233        Self { nref: header.reference_sequences().len() as u32 }
234    }
235
236    /// Create a context with explicit nref (for testing).
237    #[must_use]
238    pub fn new(nref: u32) -> Self {
239        Self { nref }
240    }
241}
242
243/// Sort order enumeration.
244#[derive(Debug, Clone, Copy, PartialEq, Eq)]
245pub enum SortOrder {
246    /// Coordinate sort: tid → pos → reverse strand
247    Coordinate,
248    /// Queryname sort: read name with natural ordering
249    Queryname,
250    /// Template-coordinate sort: template position for UMI grouping
251    TemplateCoordinate,
252}
253
254impl SortOrder {
255    /// Get the SAM header sort order tag value.
256    #[must_use]
257    pub fn header_so_tag(&self) -> &'static str {
258        match self {
259            Self::Coordinate => "coordinate",
260            Self::Queryname => "queryname",
261            Self::TemplateCoordinate => "unsorted",
262        }
263    }
264
265    /// Get the SAM header group order tag value.
266    #[must_use]
267    pub fn header_go_tag(&self) -> Option<&'static str> {
268        match self {
269            Self::Coordinate | Self::Queryname => None,
270            Self::TemplateCoordinate => Some("query"),
271        }
272    }
273
274    /// Get the SAM header sub-sort tag value.
275    #[must_use]
276    pub fn header_ss_tag(&self) -> Option<&'static str> {
277        match self {
278            Self::Coordinate | Self::Queryname => None,
279            Self::TemplateCoordinate => Some("template-coordinate"),
280        }
281    }
282}
283
284/// Trait for sort keys that can be extracted from BAM records.
285///
286/// The `Context` associated type allows sort keys to pre-compute data from the header
287/// once (e.g., library index mapping) and reuse it for each record extraction.
288pub trait SortKey: Ord + Clone + Send + Sync {
289    /// Context built once from header (e.g., `LibraryIndex` for template-coordinate).
290    /// Use `()` for keys that don't need context.
291    type Context: Clone + Send + Sync;
292
293    /// Build context from header (called once at sort start).
294    fn build_context(header: &Header) -> Self::Context;
295
296    /// Extract a sort key from a BAM record using pre-built context.
297    ///
298    /// # Errors
299    ///
300    /// Returns an error if key extraction fails due to missing or invalid fields.
301    fn from_record(record: &RecordBuf, header: &Header, ctx: &Self::Context) -> Result<Self>;
302}
303
304// ============================================================================
305// Coordinate Sort Key
306// ============================================================================
307
308/// Sort key for coordinate ordering.
309///
310/// Sort order: reference ID → position → reverse strand flag.
311/// Unmapped reads (tid = -1) are sorted to the end.
312///
313/// Note: No read name tie-breaking is used, matching samtools behavior.
314/// Equal records maintain their original input order (stable sort).
315#[derive(Clone, PartialEq, Eq, Debug)]
316pub struct CoordinateKey {
317    /// Reference sequence ID (tid), or `i32::MAX` for unmapped.
318    pub tid: i32,
319    /// 0-based alignment start position.
320    pub pos: i64,
321    /// True if reverse strand.
322    pub reverse: bool,
323}
324
325impl CoordinateKey {
326    /// Create a coordinate key for an unmapped read.
327    #[must_use]
328    pub fn unmapped() -> Self {
329        Self { tid: i32::MAX, pos: i64::MAX, reverse: false }
330    }
331}
332
333impl Ord for CoordinateKey {
334    fn cmp(&self, other: &Self) -> Ordering {
335        self.tid
336            .cmp(&other.tid)
337            .then_with(|| self.pos.cmp(&other.pos))
338            .then_with(|| self.reverse.cmp(&other.reverse))
339    }
340}
341
342impl PartialOrd for CoordinateKey {
343    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
344        Some(self.cmp(other))
345    }
346}
347
348impl SortKey for CoordinateKey {
349    type Context = ();
350
351    fn build_context(_header: &Header) -> Self::Context {}
352
353    fn from_record(record: &RecordBuf, _header: &Header, _ctx: &Self::Context) -> Result<Self> {
354        if record.flags().is_unmapped() {
355            return Ok(Self::unmapped());
356        }
357
358        #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
359        let tid = record.reference_sequence_id().map_or(-1, |id| id as i32);
360
361        #[allow(clippy::cast_possible_wrap)]
362        let pos = record.alignment_start().map_or(0, |p| usize::from(p) as i64);
363
364        let reverse = record.flags().is_reverse_complemented();
365
366        Ok(Self { tid, pos, reverse })
367    }
368}
369
370// ============================================================================
371// Raw Coordinate Sort Key (Fixed-size for RawSortKey trait)
372// ============================================================================
373
374/// Fixed-size coordinate sort key for raw BAM sorting (8 bytes).
375///
376/// This key is designed for efficient temp file storage and O(1) comparisons
377/// during merge phase. It packs:
378/// - `sort_key`: (tid << 34) | ((pos+1) << 1) | reverse
379///
380/// Note: No read name tie-breaking is used, matching samtools behavior.
381/// Equal records maintain their original input order (stable sort).
382#[repr(C)]
383#[derive(Copy, Clone, Eq, PartialEq, Debug, bytemuck::Pod, bytemuck::Zeroable)]
384pub struct RawCoordinateKey {
385    /// Packed primary sort key: (tid << 34) | ((pos+1) << 1) | reverse.
386    pub sort_key: u64,
387}
388
389impl RawCoordinateKey {
390    /// Size in bytes when serialized.
391    pub const SIZE: usize = 8;
392
393    /// Create a new coordinate key from components.
394    ///
395    /// # Arguments
396    /// * `tid` - Reference sequence ID (-1 for unmapped)
397    /// * `pos` - 0-based alignment position
398    /// * `reverse` - True if reverse complemented
399    /// * `nref` - Number of reference sequences (for unmapped handling)
400    #[inline]
401    #[must_use]
402    #[allow(clippy::cast_sign_loss)]
403    pub fn new(tid: i32, pos: i32, reverse: bool, nref: u32) -> Self {
404        // Map unmapped (tid=-1) to nref for proper sorting (after all mapped)
405        let tid = if tid < 0 { nref } else { tid as u32 };
406        // Pack: tid in high bits, (pos+1) in middle, reverse in LSB
407        let key = (u64::from(tid) << 34)
408            | ((i64::from(pos) as u64).wrapping_add(1) << 1)
409            | u64::from(reverse);
410        Self { sort_key: key }
411    }
412
413    /// Create a key for unmapped records (sorts after all mapped).
414    #[inline]
415    #[must_use]
416    pub fn unmapped() -> Self {
417        Self { sort_key: u64::MAX }
418    }
419
420    /// Create a zeroed key (for memory operations).
421    #[inline]
422    #[must_use]
423    pub fn zeroed() -> Self {
424        Self { sort_key: 0 }
425    }
426}
427
428impl Default for RawCoordinateKey {
429    fn default() -> Self {
430        Self::zeroed()
431    }
432}
433
434impl Ord for RawCoordinateKey {
435    #[inline]
436    fn cmp(&self, other: &Self) -> Ordering {
437        self.sort_key.cmp(&other.sort_key)
438    }
439}
440
441impl PartialOrd for RawCoordinateKey {
442    #[inline]
443    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
444        Some(self.cmp(other))
445    }
446}
447
448impl RawSortKey for RawCoordinateKey {
449    const SERIALIZED_SIZE: Option<usize> = Some(Self::SIZE);
450
451    #[inline]
452    fn extract(bam: &[u8], ctx: &SortContext) -> Self {
453        let tid = bam_fields::ref_id(bam);
454        let pos = bam_fields::pos(bam);
455        let reverse = (bam_fields::flags(bam) & bam_fields::flags::REVERSE) != 0;
456
457        // Create key based on tid (samtools behavior):
458        // - tid >= 0: sort by (tid, pos, reverse) even if unmapped flag is set
459        // - tid < 0: unmapped with no reference, sort at end
460        if tid < 0 { Self::unmapped() } else { Self::new(tid, pos, reverse, ctx.nref) }
461    }
462
463    #[inline]
464    fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
465        writer.write_all(&self.sort_key.to_le_bytes())
466    }
467
468    #[inline]
469    fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self> {
470        let mut buf = [0u8; 8];
471        reader.read_exact(&mut buf)?;
472        Ok(Self { sort_key: u64::from_le_bytes(buf) })
473    }
474}
475
476// ============================================================================
477// Queryname Sort Key
478// ============================================================================
479
480/// Transform flags for queryname sort ordering.
481///
482/// Matches samtools' queryname sort order (`bam_sort.c` lines 245-248):
483/// ```c
484/// // Sort order is READ1, READ2, (PRIMARY), SUPPLEMENTARY, SECONDARY
485/// fa = ((fa&0xc0)<<8)|((fa&0x100)<<3)|((fa&0x800)>>3);
486/// ```
487///
488/// This transforms the relevant flag bits into a value that sorts correctly:
489/// - R1 (0x40) and R2 (0x80) bits are shifted to high positions (bits 14-15)
490/// - SECONDARY (0x100) is shifted to middle (bit 11)
491/// - SUPPLEMENTARY (0x800) is shifted to low position (bit 8)
492///
493/// The resulting sort order is:
494/// 1. NONE (unpaired) - 0x0000
495/// 2. R1 PRIMARY      - 0x4000
496/// 3. R1 SUPPLEMENTARY - 0x4100
497/// 4. R1 SECONDARY    - 0x4800
498/// 5. R2 PRIMARY      - 0x8000
499/// 6. R2 SUPPLEMENTARY - 0x8100
500/// 7. R2 SECONDARY    - 0x8800
501#[inline]
502#[must_use]
503pub const fn queryname_flag_order(flags: u16) -> u16 {
504    ((flags & 0xc0) << 8) | ((flags & 0x100) << 3) | ((flags & 0x800) >> 3)
505}
506
507/// Sort key for queryname ordering.
508///
509/// Uses natural string ordering where numeric runs are compared numerically.
510/// Example: "read1" < "read2" < "read10" < "read11"
511#[derive(Clone, PartialEq, Eq, Debug)]
512pub struct QuerynameKey {
513    /// Read name bytes.
514    pub name: Vec<u8>,
515    /// Read pair flags for ordering R1 before R2.
516    pub flags: u16,
517}
518
519impl Ord for QuerynameKey {
520    fn cmp(&self, other: &Self) -> Ordering {
521        natural_compare(&self.name, &other.name).then_with(|| self.flags.cmp(&other.flags))
522    }
523}
524
525impl PartialOrd for QuerynameKey {
526    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
527        Some(self.cmp(other))
528    }
529}
530
531impl SortKey for QuerynameKey {
532    type Context = ();
533
534    fn build_context(_header: &Header) -> Self::Context {}
535
536    fn from_record(record: &RecordBuf, _header: &Header, _ctx: &Self::Context) -> Result<Self> {
537        let name =
538            record.name().map_or_else(Vec::new, |n| Vec::from(<_ as AsRef<[u8]>>::as_ref(n)));
539        let flags = queryname_flag_order(u16::from(record.flags()));
540        Ok(Self { name, flags })
541    }
542}
543
544/// Natural string comparison that handles numeric runs.
545///
546/// Compares strings such that "read1" < "read2" < "read10".
547fn natural_compare(a: &[u8], b: &[u8]) -> Ordering {
548    let mut i = 0;
549    let mut j = 0;
550
551    while i < a.len() && j < b.len() {
552        let a_digit = a[i].is_ascii_digit();
553        let b_digit = b[j].is_ascii_digit();
554
555        match (a_digit, b_digit) {
556            (true, true) => {
557                // Both are digits - compare numerically
558                let (a_num, a_end) = parse_number(&a[i..]);
559                let (b_num, b_end) = parse_number(&b[j..]);
560
561                match a_num.cmp(&b_num) {
562                    Ordering::Equal => {
563                        i += a_end;
564                        j += b_end;
565                    }
566                    ord => return ord,
567                }
568            }
569            (true, false) => return Ordering::Less, // Digits before non-digits
570            (false, true) => return Ordering::Greater,
571            (false, false) => {
572                // Both non-digits - compare bytes
573                match a[i].cmp(&b[j]) {
574                    Ordering::Equal => {
575                        i += 1;
576                        j += 1;
577                    }
578                    ord => return ord,
579                }
580            }
581        }
582    }
583
584    // Shorter string sorts first if one is prefix of other
585    a.len().cmp(&b.len())
586}
587
588/// Parse a numeric run from the start of a byte slice.
589/// Returns (number, bytes consumed).
590fn parse_number(bytes: &[u8]) -> (u64, usize) {
591    let mut num: u64 = 0;
592    let mut i = 0;
593
594    while i < bytes.len() && bytes[i].is_ascii_digit() {
595        num = num.saturating_mul(10).saturating_add(u64::from(bytes[i] - b'0'));
596        i += 1;
597    }
598
599    (num, i)
600}
601
602// ============================================================================
603// Raw Queryname Sort Key (Variable-size for RawSortKey trait)
604// ============================================================================
605
606/// Variable-size queryname sort key for raw BAM sorting.
607///
608/// This key stores the full read name for correct natural ordering.
609/// Unlike fixed-size keys, serialization size depends on name length.
610///
611/// Serialization format: `[name_len: u16][name: bytes][flags: u16]`
612#[derive(Clone, Eq, PartialEq, Debug, Default)]
613pub struct RawQuerynameKey {
614    /// Read name bytes.
615    pub name: Vec<u8>,
616    /// Flags for segment ordering (R1 before R2).
617    pub flags: u16,
618}
619
620impl RawQuerynameKey {
621    /// Create a new queryname key.
622    #[must_use]
623    pub fn new(name: Vec<u8>, flags: u16) -> Self {
624        Self { name, flags }
625    }
626}
627
628impl Ord for RawQuerynameKey {
629    #[inline]
630    fn cmp(&self, other: &Self) -> Ordering {
631        natural_compare(&self.name, &other.name).then_with(|| self.flags.cmp(&other.flags))
632    }
633}
634
635impl PartialOrd for RawQuerynameKey {
636    #[inline]
637    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
638        Some(self.cmp(other))
639    }
640}
641
642impl RawSortKey for RawQuerynameKey {
643    const SERIALIZED_SIZE: Option<usize> = None; // Variable-length
644
645    #[inline]
646    fn extract(bam: &[u8], _ctx: &SortContext) -> Self {
647        // BAM format offsets:
648        // 8: l_read_name (u8)
649        // 14-15: flags (u16)
650        // 32+: read name (null-terminated)
651
652        let name_len = (bam[8] as usize).saturating_sub(1); // Exclude null terminator
653        let name = if name_len > 0 && 32 + name_len <= bam.len() {
654            bam[32..32 + name_len].to_vec()
655        } else {
656            Vec::new()
657        };
658        let raw_flags = u16::from_le_bytes([bam[14], bam[15]]);
659        let flags = queryname_flag_order(raw_flags);
660
661        Self { name, flags }
662    }
663
664    #[inline]
665    fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
666        // Format: [name_len: u16][name: bytes][flags: u16]
667        let name_len = u16::try_from(self.name.len()).map_err(|_| {
668            std::io::Error::new(std::io::ErrorKind::InvalidInput, "queryname too long for u16")
669        })?;
670        writer.write_all(&name_len.to_le_bytes())?;
671        writer.write_all(&self.name)?;
672        writer.write_all(&self.flags.to_le_bytes())
673    }
674
675    #[inline]
676    fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self> {
677        // Read name length
678        let mut len_buf = [0u8; 2];
679        reader.read_exact(&mut len_buf)?;
680        let name_len = u16::from_le_bytes(len_buf) as usize;
681
682        // Read name
683        let mut name = vec![0u8; name_len];
684        reader.read_exact(&mut name)?;
685
686        // Read flags
687        let mut flags_buf = [0u8; 2];
688        reader.read_exact(&mut flags_buf)?;
689        let flags = u16::from_le_bytes(flags_buf);
690
691        Ok(Self { name, flags })
692    }
693}
694
695// ============================================================================
696// Tests
697// ============================================================================
698
699#[cfg(test)]
700mod tests {
701    use super::*;
702
703    #[test]
704    fn test_natural_compare_basic() {
705        assert_eq!(natural_compare(b"a", b"b"), Ordering::Less);
706        assert_eq!(natural_compare(b"b", b"a"), Ordering::Greater);
707        assert_eq!(natural_compare(b"a", b"a"), Ordering::Equal);
708    }
709
710    #[test]
711    fn test_natural_compare_numeric() {
712        assert_eq!(natural_compare(b"read1", b"read2"), Ordering::Less);
713        assert_eq!(natural_compare(b"read2", b"read10"), Ordering::Less);
714        assert_eq!(natural_compare(b"read10", b"read11"), Ordering::Less);
715        assert_eq!(natural_compare(b"read9", b"read10"), Ordering::Less);
716    }
717
718    #[test]
719    fn test_natural_compare_mixed() {
720        assert_eq!(natural_compare(b"a1b2", b"a1b10"), Ordering::Less);
721        assert_eq!(natural_compare(b"x10y20", b"x10y3"), Ordering::Greater);
722    }
723
724    #[test]
725    fn test_natural_compare_empty() {
726        assert_eq!(natural_compare(b"", b""), Ordering::Equal);
727        assert_eq!(natural_compare(b"", b"a"), Ordering::Less);
728        assert_eq!(natural_compare(b"a", b""), Ordering::Greater);
729    }
730
731    #[test]
732    fn test_coordinate_key_ordering() {
733        let k1 = CoordinateKey { tid: 0, pos: 100, reverse: false };
734        let k2 = CoordinateKey { tid: 0, pos: 200, reverse: false };
735        let k3 = CoordinateKey { tid: 1, pos: 50, reverse: false };
736
737        assert!(k1 < k2);
738        assert!(k2 < k3);
739    }
740
741    #[test]
742    fn test_coordinate_key_unmapped_last() {
743        let mapped = CoordinateKey { tid: 0, pos: 100, reverse: false };
744        let unmapped = CoordinateKey::unmapped();
745
746        assert!(mapped < unmapped);
747    }
748
749    #[test]
750    fn test_primary_alignment_info_roundtrip() {
751        let info = PrimaryAlignmentInfo::new(5, 1000, false, 3, 2000, true);
752        let value = info.to_tag_value();
753        let parsed = PrimaryAlignmentInfo::from_tag_value(&value);
754
755        assert!(parsed.is_some());
756        let parsed = parsed.unwrap();
757        assert_eq!(parsed.tid1, 5);
758        assert_eq!(parsed.pos1, 1000);
759        assert!(!parsed.neg1);
760        assert_eq!(parsed.tid2, 3);
761        assert_eq!(parsed.pos2, 2000);
762        assert!(parsed.neg2);
763    }
764
765    #[test]
766    fn test_primary_alignment_info_from_record() {
767        use crate::sam::builder::RecordBuilder;
768        use noodles::sam::alignment::record::Flags;
769
770        // Create a supplementary record with pa tag
771        let info = PrimaryAlignmentInfo::new(0, 100, false, 0, 200, true);
772        let record = RecordBuilder::new()
773            .name("test")
774            .sequence("ACGT")
775            .flags(Flags::SUPPLEMENTARY)
776            .tag("pa", info.to_tag_value())
777            .build();
778
779        let result = PrimaryAlignmentInfo::from_record(&record);
780        assert!(result.is_some());
781        let result = result.unwrap();
782        assert_eq!(result.tid1, 0);
783        assert_eq!(result.pos1, 100);
784        assert!(!result.neg1);
785        assert_eq!(result.tid2, 0);
786        assert_eq!(result.pos2, 200);
787        assert!(result.neg2);
788    }
789
790    #[test]
791    fn test_primary_alignment_info_missing() {
792        use crate::sam::builder::RecordBuilder;
793        use noodles::sam::alignment::record::Flags;
794
795        let record =
796            RecordBuilder::new().name("test").sequence("ACGT").flags(Flags::SUPPLEMENTARY).build();
797
798        let result = PrimaryAlignmentInfo::from_record(&record);
799        assert!(result.is_none());
800    }
801
802    // ========================================================================
803    // from_tag_value tests (fast path coverage)
804    // ========================================================================
805
806    #[test]
807    fn test_from_tag_value_int32_fast_path() {
808        use noodles::sam::alignment::record_buf::data::field::Value;
809        use noodles::sam::alignment::record_buf::data::field::value::Array;
810
811        // Create Int32 array (the expected format from zipper)
812        let values: Vec<i32> = vec![5, 1000, 0, 3, 2000, 1];
813        let value = Value::Array(Array::Int32(values));
814
815        let result = PrimaryAlignmentInfo::from_tag_value(&value);
816        assert!(result.is_some());
817        let info = result.unwrap();
818        assert_eq!(info.tid1, 5);
819        assert_eq!(info.pos1, 1000);
820        assert!(!info.neg1);
821        assert_eq!(info.tid2, 3);
822        assert_eq!(info.pos2, 2000);
823        assert!(info.neg2);
824    }
825
826    #[test]
827    fn test_from_tag_value_int32_wrong_length() {
828        use noodles::sam::alignment::record_buf::data::field::Value;
829        use noodles::sam::alignment::record_buf::data::field::value::Array;
830
831        // Int32 array with wrong number of elements
832        let values: Vec<i32> = vec![5, 1000, 0]; // Only 3 elements
833        let value = Value::Array(Array::Int32(values));
834
835        let result = PrimaryAlignmentInfo::from_tag_value(&value);
836        assert!(result.is_none());
837    }
838
839    #[test]
840    fn test_from_tag_value_int16_fallback() {
841        use noodles::sam::alignment::record_buf::data::field::Value;
842        use noodles::sam::alignment::record_buf::data::field::value::Array;
843
844        // Int16 array (rare, but should work via fallback path)
845        let values: Vec<i16> = vec![5, 1000, 0, 3, 2000, 1];
846        let value = Value::Array(Array::Int16(values));
847
848        let result = PrimaryAlignmentInfo::from_tag_value(&value);
849        assert!(result.is_some());
850        let info = result.unwrap();
851        assert_eq!(info.tid1, 5);
852        assert_eq!(info.pos1, 1000);
853    }
854
855    #[test]
856    fn test_from_tag_value_non_array_returns_none() {
857        use noodles::sam::alignment::record_buf::data::field::Value;
858
859        // String value instead of array
860        let value = Value::String("not_an_array".into());
861
862        let result = PrimaryAlignmentInfo::from_tag_value(&value);
863        assert!(result.is_none());
864    }
865
866    #[test]
867    fn test_from_tag_value_float_array_returns_none() {
868        use noodles::sam::alignment::record_buf::data::field::Value;
869        use noodles::sam::alignment::record_buf::data::field::value::Array;
870
871        // Float array is not supported
872        let values: Vec<f32> = vec![5.0, 1000.0, 0.0, 3.0, 2000.0, 1.0];
873        let value = Value::Array(Array::Float(values));
874
875        let result = PrimaryAlignmentInfo::from_tag_value(&value);
876        assert!(result.is_none());
877    }
878
879    // ========================================================================
880    // PrimaryAlignmentInfo::new edge cases
881    // ========================================================================
882
883    #[test]
884    fn test_primary_alignment_info_new_stores_values_unchanged() {
885        // Verify new() stores values exactly as provided (no normalization)
886        let info = PrimaryAlignmentInfo::new(5, 1000, true, 3, 500, false);
887
888        // Values should be stored exactly as provided
889        assert_eq!(info.tid1, 5);
890        assert_eq!(info.pos1, 1000);
891        assert!(info.neg1);
892        assert_eq!(info.tid2, 3);
893        assert_eq!(info.pos2, 500);
894        assert!(!info.neg2);
895    }
896
897    #[test]
898    fn test_primary_alignment_info_new_with_negative_positions() {
899        // Edge case: negative positions (can happen with soft clips before position 0)
900        let info = PrimaryAlignmentInfo::new(0, -5, false, 0, -10, true);
901
902        assert_eq!(info.tid1, 0);
903        assert_eq!(info.pos1, -5);
904        assert!(!info.neg1);
905        assert_eq!(info.tid2, 0);
906        assert_eq!(info.pos2, -10);
907        assert!(info.neg2);
908    }
909
910    #[test]
911    fn test_primary_alignment_info_new_with_max_values() {
912        // Edge case: maximum i32 values
913        let info = PrimaryAlignmentInfo::new(i32::MAX, i32::MAX, true, i32::MAX, i32::MAX, true);
914
915        assert_eq!(info.tid1, i32::MAX);
916        assert_eq!(info.pos1, i32::MAX);
917        assert!(info.neg1);
918        assert_eq!(info.tid2, i32::MAX);
919        assert_eq!(info.pos2, i32::MAX);
920        assert!(info.neg2);
921    }
922
923    #[test]
924    fn test_primary_alignment_info_new_with_zero_values() {
925        // Edge case: all zeros (unmapped or start of reference)
926        let info = PrimaryAlignmentInfo::new(0, 0, false, 0, 0, false);
927
928        assert_eq!(info.tid1, 0);
929        assert_eq!(info.pos1, 0);
930        assert!(!info.neg1);
931        assert_eq!(info.tid2, 0);
932        assert_eq!(info.pos2, 0);
933        assert!(!info.neg2);
934    }
935
936    #[test]
937    fn test_primary_alignment_info_roundtrip_with_negative_positions() {
938        // Verify negative positions survive roundtrip
939        let info = PrimaryAlignmentInfo::new(0, -10, false, 0, -5, true);
940        let value = info.to_tag_value();
941        let parsed = PrimaryAlignmentInfo::from_tag_value(&value).unwrap();
942
943        assert_eq!(parsed.tid1, 0);
944        assert_eq!(parsed.pos1, -10);
945        assert!(!parsed.neg1);
946        assert_eq!(parsed.tid2, 0);
947        assert_eq!(parsed.pos2, -5);
948        assert!(parsed.neg2);
949    }
950
951    #[test]
952    fn test_primary_alignment_info_position_order_is_caller_responsibility() {
953        // Verify that new() does NOT enforce pos1 < pos2 ordering
954        // (ordering is done by the caller in zipper.rs)
955        let info = PrimaryAlignmentInfo::new(0, 2000, false, 0, 1000, false);
956
957        // Values are stored as provided, even if "out of order"
958        assert_eq!(info.pos1, 2000); // pos1 > pos2 is allowed
959        assert_eq!(info.pos2, 1000);
960    }
961
962    // ========================================================================
963    // queryname_flag_order tests
964    // ========================================================================
965
966    /// Test exact transformation values match samtools formula.
967    /// Formula: ((flags & 0xc0) << 8) | ((flags & 0x100) << 3) | ((flags & 0x800) >> 3)
968    #[test]
969    fn test_queryname_flag_order_exact_transformation_values() {
970        // Flag bits:
971        // 0x40  = READ1 (bit 6)
972        // 0x80  = READ2 (bit 7)
973        // 0x100 = SECONDARY (bit 8)
974        // 0x800 = SUPPLEMENTARY (bit 11)
975
976        // NONE (unpaired primary)
977        assert_eq!(queryname_flag_order(0x0000), 0x0000);
978
979        // R1 PRIMARY: 0x40 << 8 = 0x4000
980        assert_eq!(queryname_flag_order(0x0040), 0x4000);
981
982        // R2 PRIMARY: 0x80 << 8 = 0x8000
983        assert_eq!(queryname_flag_order(0x0080), 0x8000);
984
985        // SECONDARY only: 0x100 << 3 = 0x800
986        assert_eq!(queryname_flag_order(0x0100), 0x0800);
987
988        // SUPPLEMENTARY only: 0x800 >> 3 = 0x100
989        assert_eq!(queryname_flag_order(0x0800), 0x0100);
990
991        // R1 SUPPLEMENTARY: (0x40 << 8) | (0x800 >> 3) = 0x4000 | 0x100 = 0x4100
992        assert_eq!(queryname_flag_order(0x0840), 0x4100);
993
994        // R1 SECONDARY: (0x40 << 8) | (0x100 << 3) = 0x4000 | 0x800 = 0x4800
995        assert_eq!(queryname_flag_order(0x0140), 0x4800);
996
997        // R2 SUPPLEMENTARY: (0x80 << 8) | (0x800 >> 3) = 0x8000 | 0x100 = 0x8100
998        assert_eq!(queryname_flag_order(0x0880), 0x8100);
999
1000        // R2 SECONDARY: (0x80 << 8) | (0x100 << 3) = 0x8000 | 0x800 = 0x8800
1001        assert_eq!(queryname_flag_order(0x0180), 0x8800);
1002    }
1003
1004    /// Test the complete sort order of all 12 categories.
1005    /// Order: NONE < R1 PRIMARY < R1 SUPP < R1 SEC < R2 PRIMARY < R2 SUPP < R2 SEC
1006    ///        (and combined R1+R2 categories)
1007    #[test]
1008    fn test_queryname_flag_order_complete_sort_order() {
1009        // All 12 possible combinations sorted in expected order
1010        let flags_in_order = [
1011            // NONE (unpaired)
1012            0x0000u16, // NONE PRIMARY
1013            0x0800,    // NONE SUPPLEMENTARY
1014            0x0100,    // NONE SECONDARY
1015            // R1
1016            0x0040, // R1 PRIMARY
1017            0x0840, // R1 SUPPLEMENTARY
1018            0x0140, // R1 SECONDARY
1019            // R2
1020            0x0080, // R2 PRIMARY
1021            0x0880, // R2 SUPPLEMENTARY
1022            0x0180, // R2 SECONDARY
1023            // R1+R2 (unusual but possible in edge cases)
1024            0x00c0, // R1+R2 PRIMARY
1025            0x08c0, // R1+R2 SUPPLEMENTARY
1026            0x01c0, // R1+R2 SECONDARY
1027        ];
1028
1029        // Transform all flags
1030        let transformed: Vec<u16> =
1031            flags_in_order.iter().map(|&f| queryname_flag_order(f)).collect();
1032
1033        // Verify sorted order
1034        for i in 0..transformed.len() - 1 {
1035            assert!(
1036                transformed[i] < transformed[i + 1],
1037                "Expected flags 0x{:04x} (transformed 0x{:04x}) < 0x{:04x} (transformed 0x{:04x})",
1038                flags_in_order[i],
1039                transformed[i],
1040                flags_in_order[i + 1],
1041                transformed[i + 1]
1042            );
1043        }
1044    }
1045
1046    /// Test R1 always sorts before R2.
1047    #[test]
1048    fn test_queryname_flag_order_r1_before_r2() {
1049        // R1 variants
1050        let r1_primary = queryname_flag_order(0x0040);
1051        let r1_supp = queryname_flag_order(0x0840);
1052        let r1_sec = queryname_flag_order(0x0140);
1053
1054        // R2 variants
1055        let r2_primary = queryname_flag_order(0x0080);
1056        let r2_supp = queryname_flag_order(0x0880);
1057        let r2_sec = queryname_flag_order(0x0180);
1058
1059        // All R1 variants should be less than all R2 variants
1060        assert!(r1_primary < r2_primary);
1061        assert!(r1_primary < r2_supp);
1062        assert!(r1_primary < r2_sec);
1063        assert!(r1_supp < r2_primary);
1064        assert!(r1_supp < r2_supp);
1065        assert!(r1_supp < r2_sec);
1066        assert!(r1_sec < r2_primary);
1067        assert!(r1_sec < r2_supp);
1068        assert!(r1_sec < r2_sec);
1069    }
1070
1071    /// Test PRIMARY < SUPPLEMENTARY < SECONDARY within each read type.
1072    #[test]
1073    fn test_queryname_flag_order_primary_before_supplementary_before_secondary() {
1074        // Test for NONE (unpaired)
1075        let none_pri = queryname_flag_order(0x0000);
1076        let none_supp = queryname_flag_order(0x0800);
1077        let none_sec = queryname_flag_order(0x0100);
1078        assert!(none_pri < none_supp, "NONE: PRIMARY < SUPP");
1079        assert!(none_supp < none_sec, "NONE: SUPP < SEC");
1080
1081        // Test for R1
1082        let r1_pri = queryname_flag_order(0x0040);
1083        let r1_supp = queryname_flag_order(0x0840);
1084        let r1_sec = queryname_flag_order(0x0140);
1085        assert!(r1_pri < r1_supp, "R1: PRIMARY < SUPP");
1086        assert!(r1_supp < r1_sec, "R1: SUPP < SEC");
1087
1088        // Test for R2
1089        let r2_pri = queryname_flag_order(0x0080);
1090        let r2_supp = queryname_flag_order(0x0880);
1091        let r2_sec = queryname_flag_order(0x0180);
1092        assert!(r2_pri < r2_supp, "R2: PRIMARY < SUPP");
1093        assert!(r2_supp < r2_sec, "R2: SUPP < SEC");
1094    }
1095
1096    /// Test NONE (unpaired) sorts before R1 sorts before R2.
1097    #[test]
1098    fn test_queryname_flag_order_none_before_r1_before_r2() {
1099        let none = queryname_flag_order(0x0000);
1100        let r1 = queryname_flag_order(0x0040);
1101        let r2 = queryname_flag_order(0x0080);
1102
1103        assert!(none < r1, "NONE < R1");
1104        assert!(r1 < r2, "R1 < R2");
1105    }
1106
1107    /// Test that irrelevant flags don't affect ordering.
1108    /// Only 0x40, 0x80, 0x100, 0x800 should matter.
1109    #[test]
1110    fn test_queryname_flag_order_irrelevant_flags_do_not_affect_order() {
1111        // Irrelevant flags: PAIRED(0x1), PROPER_PAIR(0x2), UNMAPPED(0x4), MATE_UNMAPPED(0x8),
1112        // REVERSE(0x10), MATE_REVERSE(0x20), DUPLICATE(0x400), FAIL_QC(0x200)
1113        let irrelevant_flags: u16 = 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x200 | 0x400;
1114
1115        // Base case: R1 PRIMARY
1116        let r1_base = queryname_flag_order(0x0040);
1117        let r1_with_irrelevant = queryname_flag_order(0x0040 | irrelevant_flags);
1118        assert_eq!(r1_base, r1_with_irrelevant, "Irrelevant flags should not change result");
1119
1120        // R2 SUPPLEMENTARY
1121        let r2_supp_base = queryname_flag_order(0x0880);
1122        let r2_supp_with = queryname_flag_order(0x0880 | irrelevant_flags);
1123        assert_eq!(r2_supp_base, r2_supp_with);
1124
1125        // NONE SECONDARY
1126        let none_sec_base = queryname_flag_order(0x0100);
1127        let none_sec_with = queryname_flag_order(0x0100 | irrelevant_flags);
1128        assert_eq!(none_sec_base, none_sec_with);
1129    }
1130
1131    /// Test with real-world flag combinations commonly seen in BAM files.
1132    #[test]
1133    fn test_queryname_flag_order_real_world_flags() {
1134        // Common real-world flag values (from paired-end sequencing)
1135
1136        // R1 forward, properly paired: 0x63 = PAIRED|PROPER|MATE_REV|R1 = 0x1|0x2|0x20|0x40
1137        let r1_fwd = queryname_flag_order(0x0063);
1138        assert_eq!(r1_fwd, 0x4000, "R1 fwd should extract to R1 PRIMARY");
1139
1140        // R2 reverse, properly paired: 0x93 = PAIRED|PROPER|REV|R2 = 0x1|0x2|0x10|0x80
1141        let r2_rev = queryname_flag_order(0x0093);
1142        assert_eq!(r2_rev, 0x8000, "R2 rev should extract to R2 PRIMARY");
1143
1144        // R1 supplementary: 0x841 = PAIRED|R1|SUPP = 0x1|0x40|0x800
1145        let r1_supp = queryname_flag_order(0x0841);
1146        assert_eq!(r1_supp, 0x4100, "R1 supp should extract to R1 SUPP");
1147
1148        // R2 secondary: 0x181 = PAIRED|R2|SEC = 0x1|0x80|0x100
1149        let r2_sec = queryname_flag_order(0x0181);
1150        assert_eq!(r2_sec, 0x8800, "R2 sec should extract to R2 SEC");
1151
1152        // Verify ordering
1153        assert!(r1_fwd < r1_supp);
1154        assert!(r1_supp < r2_rev);
1155        assert!(r2_rev < r2_sec);
1156    }
1157
1158    /// Test from actual test data showing the bug fix.
1159    /// samtools order: 113 → 2161 → 177 (R1 primary → R1 supplementary → R2 primary)
1160    /// fgumi (before fix): 113 → 177 → 2161 (wrong - puts R2 primary before R1 supp)
1161    #[test]
1162    fn test_queryname_flag_order_from_test_data() {
1163        // Flags from actual test data:
1164        // 113 = 0x71 = PAIRED|PROPER|REV|R1 (R1 PRIMARY on reverse strand)
1165        // 177 = 0xB1 = PAIRED|PROPER|REV|R2 (R2 PRIMARY on reverse strand)
1166        // 2161 = 0x871 = PAIRED|PROPER|REV|R1|SUPP (R1 SUPPLEMENTARY)
1167
1168        let f113 = queryname_flag_order(113); // R1 PRIMARY
1169        let f177 = queryname_flag_order(177); // R2 PRIMARY
1170        let f2161 = queryname_flag_order(2161); // R1 SUPPLEMENTARY
1171
1172        // Correct order: R1 PRIMARY < R1 SUPP < R2 PRIMARY
1173        assert!(f113 < f2161, "R1 PRIMARY (113) should be < R1 SUPP (2161): {f113} vs {f2161}");
1174        assert!(f2161 < f177, "R1 SUPP (2161) should be < R2 PRIMARY (177): {f2161} vs {f177}");
1175    }
1176
1177    /// Test edge case: both R1 and R2 flags set (unusual but possible).
1178    #[test]
1179    fn test_queryname_flag_order_edge_case_both_r1_r2() {
1180        // Both R1 and R2 set: 0xc0
1181        let both = queryname_flag_order(0x00c0);
1182
1183        // Should combine: (0xc0 << 8) = 0xc000
1184        assert_eq!(both, 0xc000);
1185
1186        // Should sort after both R1-only and R2-only
1187        let r1_only = queryname_flag_order(0x0040);
1188        let r2_only = queryname_flag_order(0x0080);
1189        assert!(r1_only < both);
1190        assert!(r2_only < both);
1191    }
1192
1193    /// Test that the function is const-evaluable.
1194    #[test]
1195    fn test_queryname_flag_order_is_const() {
1196        // This compiles only if queryname_flag_order is const
1197        const R1_PRIMARY: u16 = queryname_flag_order(0x0040);
1198        const R2_PRIMARY: u16 = queryname_flag_order(0x0080);
1199
1200        assert_eq!(R1_PRIMARY, 0x4000);
1201        assert_eq!(R2_PRIMARY, 0x8000);
1202    }
1203}