// fgumi_lib/sort/inline_buffer.rs
1//! Inline record buffer for samtools-style memory layout.
2//!
3//! This module provides a contiguous buffer that stores BAM records inline
4//! with pre-computed sort keys, eliminating per-record heap allocations.
5//!
6//! Key benefits over `Vec<(Key, Record)>`:
7//! - Single large allocation instead of millions of small ones
8//! - Better cache locality - sequential memory access
9//! - Sort by index array, not by moving actual record data
10//! - ~24 bytes overhead per record vs ~110 bytes
11
12use crate::sort::bam_fields;
13use crate::sort::keys::{RawSortKey, SortContext};
14use std::cmp::Ordering;
15use std::io::{Read, Write};
16
17// ============================================================================
18// Packed Sort Keys
19// ============================================================================
20
/// Packed sort key for coordinate ordering.
///
/// Format: `(tid << 34) | ((pos+1) << 1) | reverse`
///
/// This allows single u64 comparison for most records. The +1 on pos
/// ensures pos=0 doesn't collide with unmapped (which uses MAX).
#[repr(transparent)]
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
pub struct PackedCoordKey(pub u64);

impl PackedCoordKey {
    /// Create a packed coordinate key.
    ///
    /// # Arguments
    /// * `tid` - Reference sequence ID (-1 for unmapped)
    /// * `pos` - 0-based alignment position
    /// * `reverse` - True if reverse complemented
    /// * `nref` - Number of reference sequences (for unmapped handling)
    #[inline]
    #[must_use]
    #[allow(clippy::cast_sign_loss)]
    pub fn new(tid: i32, pos: i32, reverse: bool, nref: u32) -> Self {
        // A record without a reference (tid < 0) is keyed as tid = nref, which
        // places it after every mapped reference sequence.
        let tid_field = u64::from(if tid < 0 { nref } else { tid as u32 });
        // Position is stored as pos+1 so pos=0 still yields a non-zero field;
        // wrapping_add makes pos=-1 map to 0 (before every real position).
        #[allow(clippy::cast_lossless)] // Explicit bit packing requires precise control
        let pos_field = (i64::from(pos) as u64).wrapping_add(1);
        // Layout: tid in the high bits (from bit 34), pos+1 in bits 33..1,
        // strand flag in bit 0.
        Self((tid_field << 34) | (pos_field << 1) | u64::from(reverse))
    }

    /// Create a key for unmapped records (sorts after all mapped).
    #[inline]
    #[must_use]
    pub fn unmapped() -> Self {
        Self(u64::MAX)
    }
}
61
62// ============================================================================
63// Record References (Index for Sorting)
64// ============================================================================
65
/// Reference to a record in the buffer (used for sorting).
///
/// This is a lightweight handle that can be sorted efficiently.
/// The actual record data stays in place in the buffer.
///
/// Note: No read name tie-breaking is used, matching samtools behavior.
/// Equal records maintain their original input order (stable sort).
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct RecordRef {
    /// Packed primary sort key for fast comparison.
    pub sort_key: u64,
    /// Offset into `RecordBuffer` where record header starts.
    pub offset: u64,
    /// Length of raw BAM data (excluding inline header).
    pub len: u32,
    /// Padding for 8-byte alignment.
    padding: u32,
}

/// Equality considers only the packed key, mirroring `Ord` below.
impl PartialEq for RecordRef {
    fn eq(&self, other: &Self) -> bool {
        other.sort_key == self.sort_key
    }
}

impl Eq for RecordRef {}

impl Ord for RecordRef {
    /// Total order on the packed key alone; equal keys keep their input
    /// order because the sorts applied to these refs are stable.
    fn cmp(&self, other: &Self) -> Ordering {
        Ord::cmp(&self.sort_key, &other.sort_key)
    }
}

impl PartialOrd for RecordRef {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
105
// ============================================================================
// Inline Header (stored before each record in buffer)
// ============================================================================

/// Inline record header stored in buffer before raw BAM data.
///
/// This header is written once when the record is added and contains
/// the pre-computed sort key for the record that follows it.
///
/// The header is serialized field-by-field in little-endian order by
/// `RecordBuffer::push_coordinate`, so `HEADER_SIZE` must equal the sum of
/// the serialized field widths (8 + 4 + 4 = 16 bytes).
#[repr(C)]
#[derive(Copy, Clone, Debug)]
struct InlineHeader {
    /// Pre-computed packed sort key (the same value is cached in `RecordRef`).
    sort_key: u64,
    /// Length in bytes of the raw BAM data that follows this header.
    record_len: u32,
    /// Padding so the struct is exactly 16 bytes and 8-byte aligned.
    padding: u32,
}

/// Size of `InlineHeader` in bytes (16 with `repr(C)`: 8 + 4 + 4).
const HEADER_SIZE: usize = std::mem::size_of::<InlineHeader>(); // 16 bytes
126
// ============================================================================
// RecordBuffer - Main Data Structure
// ============================================================================

/// Contiguous buffer for inline record storage.
///
/// Records are stored sequentially in a single allocation:
/// ```text
/// [Header0][BAM0][Header1][BAM1][Header2][BAM2]...
/// ```
///
/// An index array of `RecordRef` is maintained for sorting.
/// Sorting only reorders the index; records stay in place.
pub struct RecordBuffer {
    /// Raw byte storage for all records (inline headers + raw BAM data).
    data: Vec<u8>,
    /// Index of record references for sorting (one entry per record).
    refs: Vec<RecordRef>,
    /// Number of reference sequences; records with tid = -1 are keyed as if
    /// their tid were `nref`, so they sort after all mapped records.
    nref: u32,
}
148
149impl RecordBuffer {
150    /// Create a new buffer with estimated capacity.
151    ///
152    /// # Arguments
153    /// * `estimated_records` - Expected number of records
154    /// * `estimated_bytes` - Expected total bytes of BAM data
155    /// * `nref` - Number of reference sequences in header
156    #[must_use]
157    pub fn with_capacity(estimated_records: usize, estimated_bytes: usize, nref: u32) -> Self {
158        Self {
159            data: Vec::with_capacity(estimated_bytes + estimated_records * HEADER_SIZE),
160            refs: Vec::with_capacity(estimated_records),
161            nref,
162        }
163    }
164
165    /// Push a record for coordinate sorting.
166    ///
167    /// Extracts the sort key inline from raw BAM bytes (zero-copy).
168    ///
169    /// # Panics
170    ///
171    /// Panics if the record length exceeds `u32::MAX`.
172    #[inline]
173    pub fn push_coordinate(&mut self, record: &[u8]) {
174        let offset = self.data.len() as u64;
175        let len = u32::try_from(record.len()).expect("record length exceeds u32::MAX");
176
177        // Extract sort key from raw BAM bytes
178        let sort_key = extract_coordinate_key_inline(record, self.nref);
179
180        // Write inline header (16 bytes)
181        let header = InlineHeader { sort_key, record_len: len, padding: 0 };
182
183        // Extend data with header bytes
184        self.data.extend_from_slice(&header.sort_key.to_le_bytes());
185        self.data.extend_from_slice(&header.record_len.to_le_bytes());
186        self.data.extend_from_slice(&header.padding.to_le_bytes());
187
188        // Write raw BAM data
189        self.data.extend_from_slice(record);
190
191        // Add to index
192        self.refs.push(RecordRef { sort_key, offset, len, padding: 0 });
193    }
194
195    /// Sort the index by key (records stay in place).
196    ///
197    /// Uses radix sort for O(n×k) performance instead of O(n log n) comparison sort.
198    /// Falls back to insertion sort for small arrays.
199    pub fn sort(&mut self) {
200        radix_sort_record_refs(&mut self.refs);
201    }
202
203    /// Sort using parallel radix sort (for large arrays).
204    ///
205    /// Divides data into chunks, sorts each with radix sort, then merges.
206    /// For very large arrays this is faster than single-threaded radix sort.
207    pub fn par_sort(&mut self) {
208        // For parallel sort, we use chunked radix sort
209        // Radix sort is already O(n×k), so parallelizing chunks provides
210        // linear speedup without the merge overhead of comparison-based parallel sorts
211        parallel_radix_sort_record_refs(&mut self.refs);
212    }
213
214    /// Get record bytes by reference.
215    #[inline]
216    #[must_use]
217    #[allow(clippy::cast_possible_truncation)]
218    pub fn get_record(&self, r: &RecordRef) -> &[u8] {
219        let start = r.offset as usize + HEADER_SIZE;
220        let end = start + r.len as usize;
221        &self.data[start..end]
222    }
223
224    /// Iterate over sorted records.
225    pub fn iter_sorted(&self) -> impl Iterator<Item = &[u8]> {
226        self.refs.iter().map(|r| self.get_record(r))
227    }
228
229    /// Get the sorted record references.
230    #[must_use]
231    pub fn refs(&self) -> &[RecordRef] {
232        &self.refs
233    }
234
235    /// Memory usage in bytes (actual data stored, not capacity).
236    #[must_use]
237    pub fn memory_usage(&self) -> usize {
238        self.data.len() + self.refs.len() * std::mem::size_of::<RecordRef>()
239    }
240
241    /// Number of records.
242    #[must_use]
243    pub fn len(&self) -> usize {
244        self.refs.len()
245    }
246
247    /// Check if buffer is empty.
248    #[must_use]
249    pub fn is_empty(&self) -> bool {
250        self.refs.is_empty()
251    }
252
253    /// Clear the buffer for reuse.
254    pub fn clear(&mut self) {
255        self.data.clear();
256        self.refs.clear();
257    }
258
259    /// Get the number of reference sequences.
260    #[must_use]
261    pub fn nref(&self) -> u32 {
262        self.nref
263    }
264}
265
266// ============================================================================
267// Key Extraction Functions
268// ============================================================================
269
270/// Extract coordinate sort key directly from raw BAM bytes.
271///
272/// This is a zero-copy operation that reads fields at fixed offsets.
273///
274/// For coordinate sorting (following samtools behavior):
275/// - Reads with valid tid (>= 0) are sorted by (tid, pos, reverse)
276/// - Reads with tid = -1 (no reference) sort at the end
277/// - Unmapped reads with a valid tid use that tid for sorting (typically mate's position)
278///
279/// Note: No read name tie-breaking is used, matching samtools behavior.
280/// Equal records maintain their original input order (stable sort).
281#[inline]
282#[must_use]
283pub fn extract_coordinate_key_inline(bam: &[u8], nref: u32) -> u64 {
284    let tid = bam_fields::ref_id(bam);
285    let pos = bam_fields::pos(bam);
286    let reverse = (bam_fields::flags(bam) & bam_fields::flags::REVERSE) != 0;
287
288    // Pack key based on tid (samtools behavior):
289    // - tid >= 0: sort by (tid, pos, reverse) even if unmapped flag is set
290    // - tid < 0: unmapped with no reference, sort at end
291    if tid < 0 {
292        PackedCoordKey::unmapped().0
293    } else {
294        PackedCoordKey::new(tid, pos, reverse, nref).0
295    }
296}
297
// ============================================================================
// Template-Coordinate Support
// ============================================================================

/// Extended key for template-coordinate sorting.
///
/// Template-coordinate requires comparing multiple fields:
/// tid1, tid2, pos1, pos2, neg1, neg2, library, MI, name, `is_upper`
///
/// We pack these into 4 u64 values for efficient comparison; the `Ord` impl
/// compares the words in declaration order (`primary`, `secondary`,
/// `tertiary`, `name_hash_upper`).
///
/// The `name_hash_upper` field packs both `name_hash` and `is_upper`:
/// - Upper 63 bits: name hash (groups same names together)
/// - Lowest bit: `is_upper` (false=0, true=1)
///
/// This ensures reads from the same template stay together (same hash),
/// with `is_upper=false` sorting before `is_upper=true`.
#[repr(C)]
#[derive(Copy, Clone, Debug, Eq, PartialEq, bytemuck::Pod, bytemuck::Zeroable)]
pub struct TemplateKey {
    /// Packed: (tid1 << 48) | (tid2 << 32) | pos1
    /// Comparison order matches samtools: tid1, tid2, pos1
    pub primary: u64,
    /// Packed: (pos2 << 32) | (!neg1 << 1) | !neg2
    /// neg flags inverted so reverse (neg=true) sorts before forward (neg=false)
    pub secondary: u64,
    /// Packed: (library << 48) | (`mi_value` << 1) | `mi_suffix`
    /// (`mi_suffix` stores the inverted /A flag so /B sorts after /A).
    pub tertiary: u64,
    /// Packed: (`name_hash` << 1) | `is_upper`
    /// This ensures same-name records group together, with `is_upper` as tie-breaker.
    pub name_hash_upper: u64,
}
329
impl TemplateKey {
    /// Create a new template key from extracted fields.
    ///
    /// `mi` is `(value, is_a)`: the numeric MI value plus whether the MI tag
    /// carried an `/A` suffix (the bit is stored inverted so `/B` sorts after `/A`).
    #[allow(clippy::too_many_arguments, clippy::cast_sign_loss)]
    #[must_use]
    pub fn new(
        tid1: i32,
        pos1: i32,
        neg1: bool,
        tid2: i32,
        pos2: i32,
        neg2: bool,
        library: u32,
        mi: (u64, bool),
        name_hash: u64,
        is_upper: bool,
    ) -> Self {
        // Handle i32::MAX specially (indicates unmapped mate); otherwise clamp
        // negative tids to 0 and keep the low 16 bits.
        let tid1_packed = if tid1 == i32::MAX { 0xFFFF_u64 } else { (tid1.max(0) as u64) & 0xFFFF };
        let tid2_packed = if tid2 == i32::MAX { 0xFFFF_u64 } else { (tid2.max(0) as u64) & 0xFFFF };
        // Convert signed positions to unsigned preserving sort order (XOR with sign bit).
        // This ensures negative positions sort correctly before positive ones.
        // i32::MAX maps to 0xFFFFFFFF which sorts last (used for unmapped mate sentinel).
        let pos1_packed = u64::from((pos1 as u32) ^ 0x8000_0000) & 0xFFFF_FFFF;
        let pos2_packed = u64::from((pos2 as u32) ^ 0x8000_0000) & 0xFFFF_FFFF;

        // Pack primary: tid1 (bits 63-48), tid2 (bits 47-32), pos1 (bits 31-0)
        // This ensures comparison order matches samtools: tid1, tid2, pos1
        let p1 = (tid1_packed << 48) | (tid2_packed << 32) | pos1_packed;

        // Pack secondary: pos2 (bits 63-32), neg1 (bit 1), neg2 (bit 0)
        // Invert neg flags: samtools sorts reverse (neg=true) BEFORE forward (neg=false)
        // By storing !neg, we get: !true=0 < !false=1, so reverse sorts first
        let p2 = (pos2_packed << 32) | (u64::from(!neg1) << 1) | u64::from(!neg2);

        // Pack tertiary: library (high 16 bits), mi_value (middle 48 bits), mi_suffix (bit 0)
        // Note: /B suffix should sort after /A, so we use !is_a as the bit
        let p3 = ((u64::from(library) & 0xFFFF) << 48)
            | ((mi.0 & 0xFFFF_FFFF_FFFF) << 1)
            | u64::from(!mi.1);

        // Pack name_hash and is_upper: hash in upper 63 bits, is_upper in bit 0
        // This ensures same-name records group together, with is_upper=false before is_upper=true
        let p4 = (name_hash << 1) | u64::from(is_upper);

        Self { primary: p1, secondary: p2, tertiary: p3, name_hash_upper: p4 }
    }

    /// Create a key for completely unmapped records.
    ///
    /// `primary`/`secondary` are saturated to MAX so these records sort after
    /// every mapped template; within the group, ordering falls back to the
    /// name hash with `is_read2` as the lowest bit.
    #[must_use]
    pub fn unmapped(name_hash: u64, is_read2: bool) -> Self {
        Self {
            primary: u64::MAX,
            secondary: u64::MAX,
            tertiary: 0,
            name_hash_upper: (name_hash << 1) | u64::from(is_read2),
        }
    }

    /// Create a zeroed key (used as dummy for memory operations).
    #[inline]
    #[must_use]
    pub fn zeroed() -> Self {
        Self { primary: 0, secondary: 0, tertiary: 0, name_hash_upper: 0 }
    }
}
395
396impl Default for TemplateKey {
397    fn default() -> Self {
398        Self::zeroed()
399    }
400}
401
402impl TemplateKey {
403    /// Serialize to bytes for storage in keyed temp files.
404    #[inline]
405    #[must_use]
406    pub fn to_bytes(&self) -> [u8; 32] {
407        let mut buf = [0u8; 32];
408        buf[0..8].copy_from_slice(&self.primary.to_le_bytes());
409        buf[8..16].copy_from_slice(&self.secondary.to_le_bytes());
410        buf[16..24].copy_from_slice(&self.tertiary.to_le_bytes());
411        buf[24..32].copy_from_slice(&self.name_hash_upper.to_le_bytes());
412        buf
413    }
414
415    /// Deserialize from bytes read from keyed temp files.
416    #[inline]
417    #[must_use]
418    pub fn from_bytes(buf: &[u8; 32]) -> Self {
419        Self {
420            primary: u64::from_le_bytes([
421                buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7],
422            ]),
423            secondary: u64::from_le_bytes([
424                buf[8], buf[9], buf[10], buf[11], buf[12], buf[13], buf[14], buf[15],
425            ]),
426            tertiary: u64::from_le_bytes([
427                buf[16], buf[17], buf[18], buf[19], buf[20], buf[21], buf[22], buf[23],
428            ]),
429            name_hash_upper: u64::from_le_bytes([
430                buf[24], buf[25], buf[26], buf[27], buf[28], buf[29], buf[30], buf[31],
431            ]),
432        }
433    }
434}
435
436impl PartialOrd for TemplateKey {
437    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
438        Some(self.cmp(other))
439    }
440}
441
442impl Ord for TemplateKey {
443    fn cmp(&self, other: &Self) -> Ordering {
444        self.primary
445            .cmp(&other.primary)
446            .then_with(|| self.secondary.cmp(&other.secondary))
447            .then_with(|| self.tertiary.cmp(&other.tertiary))
448            // name_hash_upper comparison handles both name grouping AND is_upper ordering
449            .then_with(|| self.name_hash_upper.cmp(&other.name_hash_upper))
450    }
451}
452
453impl TemplateKey {
454    /// Compare only core fields (tid1, tid2, pos1, pos2, neg1, neg2, library, MI).
455    ///
456    /// This ignores the `name_hash` tie-breaker, allowing verification to accept
457    /// both fgumi and samtools sorted files (which differ only in tie-breaking).
458    #[inline]
459    #[must_use]
460    pub fn core_cmp(&self, other: &Self) -> Ordering {
461        self.primary
462            .cmp(&other.primary)
463            .then_with(|| self.secondary.cmp(&other.secondary))
464            .then_with(|| self.tertiary.cmp(&other.tertiary))
465    }
466}
467
468impl RawSortKey for TemplateKey {
469    const SERIALIZED_SIZE: Option<usize> = Some(32);
470
471    fn extract(_bam: &[u8], _ctx: &SortContext) -> Self {
472        // TemplateKey extraction requires LibraryLookup context not available through the
473        // RawSortKey trait interface.  All callers use extract_template_key_inline() in raw.rs
474        // instead.  This arm exists only to satisfy the trait obligation.
475        unreachable!(
476            "TemplateKey::extract() should not be called directly. \
477             Use extract_template_key_inline() with LibraryLookup instead."
478        )
479    }
480
481    #[inline]
482    fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
483        writer.write_all(&self.to_bytes())
484    }
485
486    #[inline]
487    fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self> {
488        let mut buf = [0u8; 32];
489        reader.read_exact(&mut buf)?;
490        Ok(Self::from_bytes(&buf))
491    }
492}
493
/// Inline header stored before each record in the data buffer.
/// This allows us to use a minimal ref structure while still having
/// fast access to the full sort key.
///
/// Layout (40 bytes total):
/// - primary: 8 bytes (u64)
/// - secondary: 8 bytes (u64)
/// - tertiary: 8 bytes (u64)
/// - `name_hash_upper`: 8 bytes (u64)
/// - `record_len`: 4 bytes (u32)
/// - padding: 4 bytes (u32)
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct TemplateInlineHeader {
    /// Full sort key for comparison.
    pub key: TemplateKey,
    /// Length in bytes of the raw BAM data following this header.
    pub record_len: u32,
    /// Padding for 8-byte alignment (always written as 0).
    pub padding: u32,
}

/// Size of `TemplateInlineHeader` in bytes.
///
/// Must stay in sync with the field-by-field serialization in
/// `TemplateInlineHeader::write_to` / `read_from`.
pub const TEMPLATE_HEADER_SIZE: usize = 40; // 4 * 8 (key) + 4 + 4
518
519impl TemplateInlineHeader {
520    /// Write header to a byte buffer.
521    #[inline]
522    pub fn write_to(&self, buf: &mut Vec<u8>) {
523        buf.extend_from_slice(&self.key.primary.to_le_bytes());
524        buf.extend_from_slice(&self.key.secondary.to_le_bytes());
525        buf.extend_from_slice(&self.key.tertiary.to_le_bytes());
526        buf.extend_from_slice(&self.key.name_hash_upper.to_le_bytes());
527        buf.extend_from_slice(&self.record_len.to_le_bytes());
528        buf.extend_from_slice(&self.padding.to_le_bytes());
529    }
530
531    /// Read header from a byte slice.
532    #[inline]
533    #[must_use]
534    pub fn read_from(data: &[u8]) -> Self {
535        let primary = u64::from_le_bytes([
536            data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
537        ]);
538        let secondary = u64::from_le_bytes([
539            data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
540        ]);
541        let tertiary = u64::from_le_bytes([
542            data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
543        ]);
544        let name_hash_upper = u64::from_le_bytes([
545            data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
546        ]);
547        let record_len = u32::from_le_bytes([data[32], data[33], data[34], data[35]]);
548
549        Self {
550            key: TemplateKey { primary, secondary, tertiary, name_hash_upper },
551            record_len,
552            padding: 0,
553        }
554    }
555}
556
/// Record reference for template-coordinate sorting with cached key.
///
/// Caches the full `TemplateKey` inline for O(1) comparisons during sort.
/// This trades memory (48 bytes vs 16 bytes per ref) for cache locality -
/// all comparison data is in the ref itself, avoiding random access to
/// the multi-GB data buffer during sorting.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct TemplateRecordRef {
    /// Cached sort key for O(1) comparisons without data buffer access.
    pub key: TemplateKey,
    /// Byte offset of this record's inline header in the data buffer.
    pub offset: u64,
    /// Length of raw BAM data (excluding the inline header).
    pub len: u32,
    /// Padding for alignment (always written as 0).
    pub padding: u32,
}
575
576impl PartialEq for TemplateRecordRef {
577    fn eq(&self, other: &Self) -> bool {
578        self.offset == other.offset
579    }
580}
581
582impl Eq for TemplateRecordRef {}
583
584impl PartialOrd for TemplateRecordRef {
585    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
586        Some(self.cmp(other))
587    }
588}
589
590impl Ord for TemplateRecordRef {
591    fn cmp(&self, other: &Self) -> Ordering {
592        // Note: This Ord impl is NOT used for sorting - we use sort_by with data access
593        self.offset.cmp(&other.offset)
594    }
595}
596
/// Template-coordinate record buffer with inline headers.
///
/// The full sort key is stored twice: once in an inline header inside the
/// data buffer (so keyed temp chunks can be written out), and once cached
/// in each `TemplateRecordRef` so sort comparisons never touch the
/// (potentially multi-GB) data buffer.
///
/// Memory layout:
/// ```text
/// data: [Header0][Record0][Header1][Record1]...
/// refs: [Ref0][Ref1]...
/// ```
///
/// Sorting reorders only `refs`, comparing the cached keys; record bytes
/// stay in place.
pub struct TemplateRecordBuffer {
    /// Raw byte storage: inline headers + record data.
    data: Vec<u8>,
    /// Index used for sorting; each ref caches its full `TemplateKey`.
    refs: Vec<TemplateRecordRef>,
}
617
618impl TemplateRecordBuffer {
619    /// Create a new buffer with estimated capacity.
620    #[must_use]
621    pub fn with_capacity(estimated_records: usize, estimated_bytes: usize) -> Self {
622        // Account for inline headers in data capacity
623        let header_bytes = estimated_records * TEMPLATE_HEADER_SIZE;
624        Self {
625            data: Vec::with_capacity(estimated_bytes + header_bytes),
626            refs: Vec::with_capacity(estimated_records),
627        }
628    }
629
630    /// Push a record with a pre-computed template key.
631    ///
632    /// # Panics
633    ///
634    /// Panics if the record length exceeds `u32::MAX`.
635    #[inline]
636    pub fn push(&mut self, record: &[u8], key: TemplateKey) {
637        let offset = self.data.len() as u64;
638        let record_len = u32::try_from(record.len()).expect("record length exceeds u32::MAX");
639
640        // Write inline header using manual byte operations (avoids alignment issues)
641        let header = TemplateInlineHeader { key, record_len, padding: 0 };
642        header.write_to(&mut self.data);
643
644        // Write raw BAM data
645        self.data.extend_from_slice(record);
646
647        // Add ref with cached key for O(1) sort comparisons
648        self.refs.push(TemplateRecordRef { key, offset, len: record_len, padding: 0 });
649    }
650
651    /// Sort the index by cached key using stable LSD radix sort.
652    ///
653    /// Uses multi-field radix sort which is stable (preserves relative order
654    /// of records with equal keys). This ensures deterministic output that
655    /// matches samtools when records have identical sort keys.
656    pub fn sort(&mut self) {
657        radix_sort_template_refs(&mut self.refs);
658    }
659
660    /// Sort using parallel radix sort with stable k-way merge.
661    ///
662    /// Each chunk is sorted with stable radix sort, then merged with a
663    /// heap that uses `chunk_idx` as tie-breaker to preserve input order.
664    pub fn par_sort(&mut self) {
665        parallel_radix_sort_template_refs(&mut self.refs);
666    }
667
668    /// Get record bytes by reference.
669    #[inline]
670    #[must_use]
671    #[allow(clippy::cast_possible_truncation)]
672    pub fn get_record(&self, r: &TemplateRecordRef) -> &[u8] {
673        let start = r.offset as usize + TEMPLATE_HEADER_SIZE;
674        let end = start + r.len as usize;
675        &self.data[start..end]
676    }
677
678    /// Iterate over sorted records.
679    pub fn iter_sorted(&self) -> impl Iterator<Item = &[u8]> {
680        self.refs.iter().map(|r| self.get_record(r))
681    }
682
683    /// Get the sorted record references.
684    #[must_use]
685    pub fn refs(&self) -> &[TemplateRecordRef] {
686        &self.refs
687    }
688
689    /// Memory usage in bytes (actual data stored, not capacity).
690    #[must_use]
691    pub fn memory_usage(&self) -> usize {
692        self.data.len() + self.refs.len() * std::mem::size_of::<TemplateRecordRef>()
693    }
694
695    /// Number of records.
696    #[must_use]
697    pub fn len(&self) -> usize {
698        self.refs.len()
699    }
700
701    /// Check if buffer is empty.
702    #[must_use]
703    pub fn is_empty(&self) -> bool {
704        self.refs.is_empty()
705    }
706
707    /// Get the key for a record reference (returns cached key from ref).
708    #[inline]
709    #[must_use]
710    pub fn get_key(&self, r: &TemplateRecordRef) -> TemplateKey {
711        r.key
712    }
713
714    /// Iterate over sorted (key, record) pairs.
715    /// Used for writing keyed temp chunks that preserve sort keys.
716    pub fn iter_sorted_keyed(&self) -> impl Iterator<Item = (TemplateKey, &[u8])> {
717        self.refs.iter().map(|r| (self.get_key(r), self.get_record(r)))
718    }
719
720    /// Clear the buffer for reuse.
721    pub fn clear(&mut self) {
722        self.data.clear();
723        self.refs.clear();
724    }
725
726    /// Get underlying data buffer (for direct access to raw bytes).
727    #[must_use]
728    pub fn data(&self) -> &[u8] {
729        &self.data
730    }
731}
732
// ============================================================================
// Radix Sort for RecordRef
// ============================================================================

/// Threshold below which we use insertion sort instead of radix sort.
///
/// For arrays this small, the fixed cost of the counting buckets and the
/// auxiliary buffer is not worth paying, so the sorts fall back to a simple
/// insertion sort.
const RADIX_THRESHOLD: usize = 256;
739
/// Radix sort for `RecordRef` arrays using LSD (Least Significant Digit) approach.
///
/// Sorts by the `sort_key` field using 8-bit radix (256 buckets).
/// This is O(n×k) where k is the number of bytes to sort (typically 5-8).
///
/// # Stability
/// Radix sort is inherently stable - records with equal keys maintain their
/// relative input order, matching samtools behavior.
#[allow(clippy::uninit_vec, unsafe_code)]
pub fn radix_sort_record_refs(refs: &mut [RecordRef]) {
    let n = refs.len();
    if n < RADIX_THRESHOLD {
        // Use insertion sort for small arrays
        insertion_sort_refs(refs);
        return;
    }

    // Find max key to determine how many bytes we need to sort; any byte above
    // the highest set byte is zero in every key and can be skipped.
    let max_key = refs.iter().map(|r| r.sort_key).max().unwrap_or(0);
    let bytes_needed =
        if max_key == 0 { 0 } else { ((64 - max_key.leading_zeros()) as usize).div_ceil(8) };

    if bytes_needed == 0 {
        return; // All keys are 0, already sorted
    }

    // Allocate auxiliary buffer for the ping-pong passes.
    // NOTE(review): set_len over uninitialized memory materializes uninit
    // RecordRef values. Each scatter pass fully overwrites the destination
    // before it is read, but strictly this is UB; MaybeUninit would be the
    // sound alternative — confirm intent.
    let mut aux: Vec<RecordRef> = Vec::with_capacity(n);
    unsafe {
        aux.set_len(n);
    }

    // Raw pointers let us swap the roles of the two buffers each pass.
    let mut src = refs as *mut [RecordRef];
    let mut dst = aux.as_mut_slice() as *mut [RecordRef];

    // LSD radix sort - byte by byte from least significant
    for byte_idx in 0..bytes_needed {
        // SAFETY: src and dst always point at the two distinct buffers
        // (`refs` and `aux`), swapped each iteration, so the shared and
        // mutable reborrows never alias each other.
        let src_slice = unsafe { &*src };
        let dst_slice = unsafe { &mut *dst };

        // Count occurrences of each byte value
        let mut counts = [0usize; 256];
        for r in src_slice {
            let byte = ((r.sort_key >> (byte_idx * 8)) & 0xFF) as usize;
            counts[byte] += 1;
        }

        // Convert counts to an exclusive prefix sum: counts[b] becomes the
        // first output index for bucket b.
        let mut total = 0;
        for count in &mut counts {
            let c = *count;
            *count = total;
            total += c;
        }

        // Scatter elements to destination (stable - preserves order within buckets)
        for r in src_slice {
            let byte = ((r.sort_key >> (byte_idx * 8)) & 0xFF) as usize;
            let dest_idx = counts[byte];
            counts[byte] += 1;
            dst_slice[dest_idx] = *r;
        }

        // Swap src and dst
        std::mem::swap(&mut src, &mut dst);
    }

    // After an odd number of passes the fully sorted data lives in `aux`;
    // copy it back into the caller's slice.
    if bytes_needed % 2 == 1 {
        // SAFETY: after the final swap, src points at `aux` (distinct from
        // `refs`), so reading it while writing refs is sound.
        let src_slice = unsafe { &*src };
        refs.copy_from_slice(src_slice);
    }
}
813
814/// Parallel radix sort for `RecordRef` arrays.
815///
816/// Divides the array into chunks, sorts each chunk with radix sort,
817/// then performs k-way merge. This provides near-linear speedup.
818pub fn parallel_radix_sort_record_refs(refs: &mut [RecordRef]) {
819    use rayon::prelude::*;
820
821    let n = refs.len();
822    if n < RADIX_THRESHOLD * 2 {
823        // Small array - just use single-threaded radix sort
824        radix_sort_record_refs(refs);
825        return;
826    }
827
828    // Get number of threads from rayon
829    let n_threads = rayon::current_num_threads();
830
831    // For very large arrays, parallel chunked sort + merge is faster
832    if n_threads > 1 && n > 10_000 {
833        let chunk_size = n.div_ceil(n_threads);
834
835        // Sort each chunk in parallel using radix sort
836        refs.par_chunks_mut(chunk_size).for_each(|chunk| {
837            radix_sort_record_refs(chunk);
838        });
839
840        // K-way merge the sorted chunks
841        // For simplicity, we use a heap-based merge into auxiliary storage
842        let chunk_boundaries: Vec<_> = refs
843            .chunks(chunk_size)
844            .scan(0, |pos, chunk| {
845                let start = *pos;
846                *pos += chunk.len();
847                Some(start..*pos)
848            })
849            .collect();
850
851        merge_sorted_chunks(refs, &chunk_boundaries);
852    } else {
853        // Single-threaded radix sort
854        radix_sort_record_refs(refs);
855    }
856}
857
/// Merge k sorted chunks in place using auxiliary storage.
///
/// Each range in `chunk_ranges` must describe a sub-slice of `refs` that is
/// already sorted by `sort_key`. The merge is stable: records with equal
/// keys keep the relative order of their chunks (earlier chunk wins).
///
/// Allocates an auxiliary `Vec` of the same length as `refs` for the merged
/// output, then copies it back at the end.
fn merge_sorted_chunks(refs: &mut [RecordRef], chunk_ranges: &[std::ops::Range<usize>]) {
    use crate::sort::radix::{heap_make, heap_sift_down};

    // Heap entry: (sort_key, chunk_idx, position_in_chunk)
    struct HeapEntry {
        key: u64,
        chunk_idx: usize,
        pos: usize,
    }

    // Zero or one chunk is already fully sorted - nothing to merge.
    if chunk_ranges.len() <= 1 {
        return;
    }

    let n = refs.len();
    let mut result: Vec<RecordRef> = Vec::with_capacity(n);

    // Initialize heap with first element from each chunk
    let mut heap: Vec<HeapEntry> = Vec::with_capacity(chunk_ranges.len());
    for (chunk_idx, range) in chunk_ranges.iter().enumerate() {
        if !range.is_empty() {
            heap.push(HeapEntry { key: refs[range.start].sort_key, chunk_idx, pos: range.start });
        }
    }

    if heap.is_empty() {
        return;
    }

    // Min-heap: smaller keys should be at the top
    // Use `chunk_idx` as tie-breaker for stability
    // NOTE(review): `lt(a, b)` is true when `a` sorts AFTER `b`; this assumes
    // `heap_make`/`heap_sift_down` sink elements for which the predicate is
    // true, yielding a min-heap - confirm against `crate::sort::radix`.
    let lt = |a: &HeapEntry, b: &HeapEntry| -> bool { (a.key, a.chunk_idx) > (b.key, b.chunk_idx) };

    heap_make(&mut heap, &lt);
    let mut heap_size = heap.len();

    // Merge: repeatedly take the smallest head element, then advance that
    // chunk's cursor (or shrink the heap when the chunk runs out).
    while heap_size > 0 {
        let entry = &heap[0];
        result.push(refs[entry.pos]);

        let chunk_idx = entry.chunk_idx;
        let next_pos = entry.pos + 1;
        let range = &chunk_ranges[chunk_idx];

        if next_pos < range.end {
            // More elements in this chunk
            heap[0] = HeapEntry { key: refs[next_pos].sort_key, chunk_idx, pos: next_pos };
            heap_sift_down(&mut heap, 0, heap_size, &lt);
        } else {
            // Chunk exhausted
            heap_size -= 1;
            if heap_size > 0 {
                heap.swap(0, heap_size);
                heap_sift_down(&mut heap, 0, heap_size, &lt);
            }
        }
    }

    // Copy result back
    refs.copy_from_slice(&result);
}
921
922/// Binary insertion sort for small arrays of `RecordRef`.
923fn insertion_sort_refs(refs: &mut [RecordRef]) {
924    for i in 1..refs.len() {
925        let key = refs[i].sort_key;
926        let insert_pos = refs[..i].partition_point(|r| r.sort_key <= key);
927        if insert_pos < i {
928            refs[insert_pos..=i].rotate_right(1);
929        }
930    }
931}
932
933// ============================================================================
934// Radix Sort for TemplateRecordRef (multi-field 4×u64 key)
935// ============================================================================
936
937/// Radix sort for `TemplateRecordRef` arrays using multi-field LSD approach.
938///
939/// The `TemplateKey` consists of 4 u64 fields sorted in order:
940/// primary → secondary → tertiary → `name_hash_upper`
941///
942/// For LSD radix sort, we sort from least significant to most significant:
943/// 1. Sort by `name_hash_upper`
944/// 2. Sort by tertiary (stable, preserves `name_hash_upper` order)
945/// 3. Sort by secondary (stable, preserves tertiary + `name_hash_upper` order)
946/// 4. Sort by primary (stable, final order)
947///
948/// This is O(n×k) where k is the total bytes to sort (up to 32 bytes).
949///
950/// This is used for stable sorting - LSD radix sort inherently preserves
951/// relative order of records with equal keys.
952#[allow(clippy::uninit_vec, unsafe_code)]
953pub fn radix_sort_template_refs(refs: &mut [TemplateRecordRef]) {
954    let n = refs.len();
955    if n < RADIX_THRESHOLD {
956        insertion_sort_template_refs(refs);
957        return;
958    }
959
960    // Allocate auxiliary buffer
961    let mut aux: Vec<TemplateRecordRef> = Vec::with_capacity(n);
962    unsafe {
963        aux.set_len(n);
964    }
965
966    // Sort by each field from least significant to most significant
967    // This ensures the final order is: primary → secondary → tertiary → name_hash_upper
968    let fields = [
969        |r: &TemplateRecordRef| r.key.name_hash_upper,
970        |r: &TemplateRecordRef| r.key.tertiary,
971        |r: &TemplateRecordRef| r.key.secondary,
972        |r: &TemplateRecordRef| r.key.primary,
973    ];
974
975    for (field_idx, get_field) in fields.iter().enumerate() {
976        // Find max value for this field to determine bytes needed
977        let max_val = refs.iter().map(get_field).max().unwrap_or(0);
978        let bytes_needed =
979            if max_val == 0 { 0 } else { ((64 - max_val.leading_zeros()) as usize).div_ceil(8) };
980
981        if bytes_needed == 0 {
982            continue; // All values are 0 for this field, skip
983        }
984
985        // Radix sort this field
986        radix_sort_template_field(refs, &mut aux, get_field, bytes_needed, field_idx);
987    }
988}
989
/// Radix sort a single u64 field of `TemplateRecordRef` using raw pointers.
///
/// Performs `bytes_needed` stable LSD counting-sort passes over the bytes of
/// `get_field(r)`, ping-ponging between `refs` and `aux`. After an even
/// number of passes the result already lives in `refs`; after an odd number
/// it is copied back from `aux`.
///
/// `aux` must have the same length as `refs`; its prior contents are
/// overwritten on the first pass before being read.
#[allow(clippy::uninit_vec, unsafe_code)]
fn radix_sort_template_field<F>(
    refs: &mut [TemplateRecordRef],
    aux: &mut [TemplateRecordRef],
    get_field: F,
    bytes_needed: usize,
    _field_idx: usize,
) where
    F: Fn(&TemplateRecordRef) -> u64,
{
    let n = refs.len();

    // Use raw pointers to avoid borrow checker issues with swapping
    let mut src = refs as *mut [TemplateRecordRef];
    let mut dst = aux as *mut [TemplateRecordRef];

    for byte_idx in 0..bytes_needed {
        // SAFETY: `src` and `dst` always point to the two distinct slices
        // `refs` and `aux` (in some order), both borrowed exclusively by
        // this function, so a shared borrow of one and an exclusive borrow
        // of the other cannot alias.
        let src_slice = unsafe { &*src };
        let dst_slice = unsafe { &mut *dst };

        // Count occurrences of each byte value
        let mut counts = [0usize; 256];
        for r in src_slice {
            let byte = ((get_field(r) >> (byte_idx * 8)) & 0xFF) as usize;
            counts[byte] += 1;
        }

        // Convert to cumulative offsets
        let mut total = 0;
        for count in &mut counts {
            let c = *count;
            *count = total;
            total += c;
        }

        // Scatter elements to destination (stable: equal bytes keep their
        // source order because offsets only ever increase)
        for item in src_slice.iter().take(n) {
            let byte = ((get_field(item) >> (byte_idx * 8)) & 0xFF) as usize;
            let dest_idx = counts[byte];
            counts[byte] += 1;
            dst_slice[dest_idx] = *item;
        }

        // Swap src and dst
        std::mem::swap(&mut src, &mut dst);
    }

    // If odd number of passes, copy back to original buffer
    if bytes_needed % 2 == 1 {
        // SAFETY: after an odd number of swaps `src` points at `aux`, which
        // holds the fully-sorted result; `refs` is distinct from it.
        let src_slice = unsafe { &*src };
        refs.copy_from_slice(src_slice);
    }
}
1044
1045/// Parallel radix sort for `TemplateRecordRef` arrays.
1046///
1047/// Uses stable radix sort per chunk with stable k-way merge (`chunk_idx` tie-breaker).
1048pub fn parallel_radix_sort_template_refs(refs: &mut [TemplateRecordRef]) {
1049    use rayon::prelude::*;
1050
1051    let n = refs.len();
1052    if n < RADIX_THRESHOLD * 2 {
1053        radix_sort_template_refs(refs);
1054        return;
1055    }
1056
1057    let n_threads = rayon::current_num_threads();
1058
1059    if n_threads > 1 && n > 10_000 {
1060        let chunk_size = n.div_ceil(n_threads);
1061
1062        // Sort each chunk in parallel
1063        refs.par_chunks_mut(chunk_size).for_each(|chunk| {
1064            radix_sort_template_refs(chunk);
1065        });
1066
1067        // K-way merge the sorted chunks
1068        let chunk_boundaries: Vec<_> = refs
1069            .chunks(chunk_size)
1070            .scan(0, |pos, chunk| {
1071                let start = *pos;
1072                *pos += chunk.len();
1073                Some(start..*pos)
1074            })
1075            .collect();
1076
1077        merge_sorted_template_chunks(refs, &chunk_boundaries);
1078    } else {
1079        radix_sort_template_refs(refs);
1080    }
1081}
1082
/// Merge k sorted chunks of `TemplateRecordRef` in place.
///
/// Uses `chunk_idx` as tie-breaker for stable merge (preserves input order for equal keys).
///
/// Each range in `chunk_ranges` must describe a sub-slice of `refs` already
/// sorted by `key`. Merges into an auxiliary `Vec` of the same length, then
/// copies the result back into `refs`.
fn merge_sorted_template_chunks(
    refs: &mut [TemplateRecordRef],
    chunk_ranges: &[std::ops::Range<usize>],
) {
    use crate::sort::radix::{heap_make, heap_sift_down};

    // Heap entry: the current head key of a chunk plus its cursor position.
    struct HeapEntry {
        key: TemplateKey,
        chunk_idx: usize,
        pos: usize,
    }

    // Zero or one chunk is already fully sorted - nothing to merge.
    if chunk_ranges.len() <= 1 {
        return;
    }

    let n = refs.len();
    let mut result: Vec<TemplateRecordRef> = Vec::with_capacity(n);

    // Seed the heap with the first element of every non-empty chunk.
    let mut heap: Vec<HeapEntry> = Vec::with_capacity(chunk_ranges.len());
    for (chunk_idx, range) in chunk_ranges.iter().enumerate() {
        if !range.is_empty() {
            heap.push(HeapEntry { key: refs[range.start].key, chunk_idx, pos: range.start });
        }
    }

    if heap.is_empty() {
        return;
    }

    // Min-heap with chunk_idx tie-breaker for stability
    // NOTE(review): `lt(a, b)` is true when `a` sorts AFTER `b`; assumes
    // `heap_make`/`heap_sift_down` sink elements for which the predicate
    // holds, producing a min-heap - confirm against `crate::sort::radix`.
    let lt = |a: &HeapEntry, b: &HeapEntry| -> bool {
        match a.key.cmp(&b.key) {
            std::cmp::Ordering::Greater => true,
            std::cmp::Ordering::Less => false,
            std::cmp::Ordering::Equal => a.chunk_idx > b.chunk_idx,
        }
    };

    heap_make(&mut heap, &lt);
    let mut heap_size = heap.len();

    // Repeatedly emit the smallest head, then advance that chunk's cursor
    // (or shrink the heap once the chunk is exhausted).
    while heap_size > 0 {
        let entry = &heap[0];
        result.push(refs[entry.pos]);

        let chunk_idx = entry.chunk_idx;
        let next_pos = entry.pos + 1;
        let range = &chunk_ranges[chunk_idx];

        if next_pos < range.end {
            heap[0] = HeapEntry { key: refs[next_pos].key, chunk_idx, pos: next_pos };
            heap_sift_down(&mut heap, 0, heap_size, &lt);
        } else {
            heap_size -= 1;
            if heap_size > 0 {
                heap.swap(0, heap_size);
                heap_sift_down(&mut heap, 0, heap_size, &lt);
            }
        }
    }

    // Copy the fully merged ordering back into the input slice.
    refs.copy_from_slice(&result);
}
1150
1151/// Binary insertion sort for small arrays of `TemplateRecordRef`.
1152fn insertion_sort_template_refs(refs: &mut [TemplateRecordRef]) {
1153    for i in 1..refs.len() {
1154        let key = &refs[i].key;
1155        let insert_pos = refs[..i].partition_point(|r| r.key <= *key);
1156        if insert_pos < i {
1157            refs[insert_pos..=i].rotate_right(1);
1158        }
1159    }
1160}
1161
#[cfg(test)]
mod tests {
    use super::*;

    // Unit tests covering key packing/ordering, the single-threaded and
    // parallel radix sorts, and - critically - sort stability, which the
    // template-coordinate pipeline depends on.

    #[test]
    fn test_packed_coord_key_ordering() {
        // Lower tid should come first
        assert!(PackedCoordKey::new(0, 100, false, 10) < PackedCoordKey::new(1, 100, false, 10));

        // Lower pos should come first
        assert!(PackedCoordKey::new(0, 100, false, 10) < PackedCoordKey::new(0, 200, false, 10));

        // Forward should come before reverse (false < true)
        assert!(PackedCoordKey::new(0, 100, false, 10) < PackedCoordKey::new(0, 100, true, 10));

        // Unmapped should come last
        assert!(PackedCoordKey::new(9, 1_000_000, true, 10) < PackedCoordKey::unmapped());
    }

    #[test]
    fn test_template_key_ordering() {
        let k1 = TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, true), 0, false);
        let k2 = TemplateKey::new(0, 100, false, 0, 200, false, 0, (2, true), 0, false);
        assert!(k1 < k2);

        // /A suffix should come before /B
        let ka = TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, true), 0, false);
        let kb = TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, false), 0, false);
        assert!(ka < kb);

        // Same name hash: is_upper=false should come before is_upper=true
        let lower = TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, true), 12345, false);
        let upper = TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, true), 12345, true);
        assert!(lower < upper, "is_upper=false should sort before is_upper=true");

        // Different name hashes should group separately
        let first_hash_lo =
            TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, true), 100, false);
        let first_hash_hi = TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, true), 100, true);
        let second_hash = TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, true), 200, false);
        // first_hash records should come before second_hash records
        assert!(first_hash_lo < second_hash);
        assert!(first_hash_hi < second_hash);
    }

    #[test]
    fn test_radix_sort_record_refs() {
        // Create test refs with various keys
        let mut refs = vec![
            RecordRef { sort_key: 100, offset: 0, len: 10, padding: 0 },
            RecordRef { sort_key: 50, offset: 100, len: 10, padding: 0 },
            RecordRef { sort_key: 200, offset: 200, len: 10, padding: 0 },
            RecordRef { sort_key: 50, offset: 300, len: 10, padding: 0 }, // Duplicate key
            RecordRef { sort_key: 1, offset: 400, len: 10, padding: 0 },
        ];

        radix_sort_record_refs(&mut refs);

        // Verify sorted order
        assert_eq!(refs[0].sort_key, 1);
        assert_eq!(refs[1].sort_key, 50);
        assert_eq!(refs[2].sort_key, 50);
        assert_eq!(refs[3].sort_key, 100);
        assert_eq!(refs[4].sort_key, 200);

        // Verify stability: duplicate keys should maintain original order
        // offset=100 was before offset=300 in input, should remain so
        assert_eq!(refs[1].offset, 100);
        assert_eq!(refs[2].offset, 300);
    }

    #[test]
    #[allow(clippy::cast_sign_loss)]
    fn test_radix_sort_large() {
        // Test with larger array to trigger radix sort (> RADIX_THRESHOLD)
        let mut refs: Vec<RecordRef> = (0..1000)
            .map(|i| RecordRef {
                sort_key: (999 - i) as u64, // Reverse order
                offset: i as u64 * 100,
                len: 10,
                padding: 0,
            })
            .collect();

        radix_sort_record_refs(&mut refs);

        // Verify sorted
        for (i, r) in refs.iter().enumerate() {
            assert_eq!(r.sort_key, i as u64, "Expected sort_key {i} at index {i}");
        }
    }

    #[test]
    fn test_radix_sort_empty() {
        // Empty input must be a no-op, not a panic.
        let mut refs: Vec<RecordRef> = Vec::new();
        radix_sort_record_refs(&mut refs);
        assert!(refs.is_empty());
    }

    #[test]
    fn test_radix_sort_single() {
        // A single element is trivially sorted and must be left untouched.
        let mut refs = vec![RecordRef { sort_key: 42, offset: 0, len: 10, padding: 0 }];
        radix_sort_record_refs(&mut refs);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].sort_key, 42);
    }

    #[test]
    #[allow(clippy::cast_sign_loss)]
    fn test_radix_sort_all_same_keys() {
        // All same keys should maintain original order (stability)
        let mut refs: Vec<RecordRef> = (0..100)
            .map(|i| RecordRef { sort_key: 42, offset: i as u64 * 100, len: 10, padding: 0 })
            .collect();

        radix_sort_record_refs(&mut refs);

        // Verify all keys are 42 and order is preserved
        for (i, r) in refs.iter().enumerate() {
            assert_eq!(r.sort_key, 42);
            assert_eq!(r.offset, i as u64 * 100, "Stability violated at index {i}");
        }
    }

    #[test]
    #[allow(clippy::cast_sign_loss)]
    fn test_radix_sort_all_zero_keys() {
        // Zero keys exercise the "skip all-zero bytes" fast path.
        let mut refs: Vec<RecordRef> = (0..50)
            .map(|i| RecordRef { sort_key: 0, offset: i as u64 * 100, len: 10, padding: 0 })
            .collect();

        radix_sort_record_refs(&mut refs);

        // Should be stable for all-zero keys
        for (i, r) in refs.iter().enumerate() {
            assert_eq!(r.sort_key, 0);
            assert_eq!(r.offset, i as u64 * 100);
        }
    }

    #[test]
    fn test_radix_sort_max_key() {
        // Test with maximum u64 values
        let mut refs = vec![
            RecordRef { sort_key: u64::MAX, offset: 0, len: 10, padding: 0 },
            RecordRef { sort_key: 0, offset: 100, len: 10, padding: 0 },
            RecordRef { sort_key: u64::MAX / 2, offset: 200, len: 10, padding: 0 },
        ];

        radix_sort_record_refs(&mut refs);

        assert_eq!(refs[0].sort_key, 0);
        assert_eq!(refs[1].sort_key, u64::MAX / 2);
        assert_eq!(refs[2].sort_key, u64::MAX);
    }

    #[test]
    #[allow(clippy::cast_sign_loss)]
    fn test_parallel_radix_sort() {
        // Test parallel sort with enough elements to trigger parallelism
        let mut refs: Vec<RecordRef> = (0..50_000)
            .map(|i| RecordRef {
                sort_key: (49_999 - i) as u64, // Reverse order
                offset: i as u64 * 100,
                len: 10,
                padding: 0,
            })
            .collect();

        parallel_radix_sort_record_refs(&mut refs);

        // Verify sorted
        for (i, r) in refs.iter().enumerate() {
            assert_eq!(r.sort_key, i as u64, "Expected sort_key {i} at index {i}");
        }
    }

    #[test]
    #[allow(clippy::cast_sign_loss)]
    fn test_parallel_radix_sort_stability() {
        // Test that parallel sort maintains stability for equal keys
        let mut refs: Vec<RecordRef> = (0..20_000)
            .map(|i| RecordRef {
                sort_key: (i / 100) as u64, // Groups of 100 with same key
                offset: i as u64,           // Use offset to track original order
                len: 10,
                padding: 0,
            })
            .collect();

        parallel_radix_sort_record_refs(&mut refs);

        // Verify sorted and stable within groups
        for i in 1..refs.len() {
            assert!(refs[i - 1].sort_key <= refs[i].sort_key, "Not sorted at index {i}");
            // Within same key group, offsets should be in order
            if refs[i - 1].sort_key == refs[i].sort_key {
                assert!(refs[i - 1].offset < refs[i].offset, "Stability violated at index {i}");
            }
        }
    }

    #[test]
    #[allow(clippy::cast_sign_loss)]
    fn test_radix_sort_template_refs_stability() {
        // Test that template radix sort is stable for equal keys
        // Create refs with identical TemplateKey but different offsets to track order
        let key = TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, true), 12345, false);

        let mut refs: Vec<TemplateRecordRef> = (0..500)
            .map(|i| TemplateRecordRef {
                key,              // All same key
                offset: i as u64, // Use offset to track original order
                len: 10,
                padding: 0,
            })
            .collect();

        radix_sort_template_refs(&mut refs);

        // All keys equal, so offsets should maintain original order (stability)
        for i in 1..refs.len() {
            assert!(
                refs[i - 1].offset < refs[i].offset,
                "Template radix sort stability violated at index {}: offset {} should be < {}",
                i,
                refs[i - 1].offset,
                refs[i].offset
            );
        }
    }

    #[test]
    #[allow(clippy::cast_sign_loss)]
    fn test_parallel_radix_sort_template_refs_stability() {
        // Test that parallel template sort maintains stability for equal keys
        // Use groups of records with same key to verify stability within groups
        let mut refs: Vec<TemplateRecordRef> = (0..20_000)
            .map(|i| {
                // Create groups of 100 with same key (different lib_name_hash for each group)
                let group = i / 100;
                let key = TemplateKey::new(
                    0,
                    100,
                    false,
                    0,
                    200,
                    false,
                    0,
                    (1, true),
                    group as u64, // Different hash per group
                    false,
                );
                TemplateRecordRef {
                    key,
                    offset: i as u64, // Use offset to track original order
                    len: 10,
                    padding: 0,
                }
            })
            .collect();

        parallel_radix_sort_template_refs(&mut refs);

        // Verify sorted by key and stable within groups
        for i in 1..refs.len() {
            let prev_key = &refs[i - 1].key;
            let curr_key = &refs[i].key;
            assert!(prev_key <= curr_key, "Not sorted at index {i}");
            // Within same key group, offsets should be in order (stability)
            if prev_key == curr_key {
                assert!(
                    refs[i - 1].offset < refs[i].offset,
                    "Parallel template sort stability violated at index {}: offset {} should be < {}",
                    i,
                    refs[i - 1].offset,
                    refs[i].offset
                );
            }
        }
    }

    #[test]
    fn test_template_record_buffer_sort_stability() {
        // Test that TemplateRecordBuffer::sort() is stable
        // This is the method actually used during template-coordinate sorting
        let mut buffer = TemplateRecordBuffer::with_capacity(100, 10000);

        // Add records with identical keys - they should maintain insertion order after sort
        let key = TemplateKey::new(0, 100, false, 0, 200, false, 0, (1, true), 12345, false);

        // Create distinct records (different sequence bytes) with same sort key
        for i in 0..100u8 {
            // Minimal valid BAM record: 4 bytes block_size prefix not included in our data
            // ref_id (4) + pos (4) + name_len (1) + mapq (1) + bin (2) + n_cigar (2) + flag (2) + seq_len (4)
            // + mate_ref_id (4) + mate_pos (4) + tlen (4) + name + cigar + seq + qual
            let mut record = vec![
                0, 0, 0, 0, // ref_id = 0
                100, 0, 0, 0, // pos = 100
                2, // name_len = 2 (including null)
                0, // mapq = 0
                0, 0, // bin
                0, 0, // n_cigar_op = 0
                99, 0, // flag = 99 (paired, proper, mate reverse, first)
                1, 0, 0, 0, // seq_len = 1
                0, 0, 0, 0, // mate_ref_id = 0
                200, 0, 0, 0, // mate_pos = 200
                0, 0, 0, 0, // tlen = 0
                b'A', 0, // read name "A\0"
                // no cigar
                i,    // seq (1 byte for 2 bases) - use i to make records distinguishable
                0xFF, // qual
            ];
            // Pad to make it valid
            while record.len() < 40 {
                record.push(0);
            }
            buffer.push(&record, key);
        }

        buffer.sort();

        // Verify records maintain insertion order (tracked by sequence byte value)
        // The distinguishing byte sits at offset 34: 32 bytes of fixed fields
        // plus the 2-byte read name "A\0" precede the 1-byte sequence.
        let mut prev_seq_byte = None;
        for rec in buffer.iter_sorted() {
            // Extract the distinguishing byte (seq field at offset 34 after header parsing)
            let seq_byte = rec.get(34).copied().unwrap_or(0);
            if let Some(prev) = prev_seq_byte {
                assert!(
                    prev < seq_byte,
                    "TemplateRecordBuffer::sort() stability violated: {prev} should be < {seq_byte}"
                );
            }
            prev_seq_byte = Some(seq_byte);
        }
    }
}