riker-ngs 0.2.0

Fast Rust CLI toolkit for sequencing QC metrics
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
//! SAM/BAM/CRAM readers — sequential and indexed flavors. Both live in
//! one module because they share the same `RikerRecordRequirements`-
//! driven decode logic.
//!
//! ## Two types, not one
//!
//! Originally these were a single enum with an indexed/sequential
//! variant per format. That broke down on the parallel reader path:
//! noodles' `bam::io::IndexedReader` stores its index as
//! `Box<dyn BinningIndex>`, which is not `Send`, so a single enum that
//! could ever hold an indexed variant cannot be sent to a worker thread
//! via `std::thread::scope::spawn`. Splitting keeps
//! [`AlignmentReader`] unconditionally `Send` for the parallel pipeline
//! while still letting [`IndexedAlignmentReader`] expose region queries
//! to standalone callers (currently just `error`).
//!
//! ## Reading paths
//!
//! [`AlignmentReader`] (sequential, front-to-back) exposes both:
//!
//! - [`fill_record`](AlignmentReader::fill_record) — read in place into a
//!   pre-allocated [`RikerRecord`] slot. Works for every format.
//! - [`riker_records`](AlignmentReader::riker_records) — owned-record
//!   iterator. Works for every format. Use for low-volume callers where
//!   per-record allocation isn't the bottleneck.
//!
//! [`IndexedAlignmentReader`] is region-query-only:
//!
//! - [`query_for_each`](IndexedAlignmentReader::query_for_each) — stream
//!   records overlapping a region to a callback, reusing one scratch
//!   slot for the whole region.
//!
//! Both consult a [`RikerRecordRequirements`] passed by the caller. On
//! BAM and CRAM (where aux decode is lazy) the requirements gate which
//! decoders run; on SAM noodles decodes eagerly so requirements are
//! informational.
//!
//! ## CRAM via rust-htslib
//!
//! CRAM reads go through `rust-htslib` rather than `noodles-cram`:
//! benchmarks on a 3.1 GB exome CRAM showed htslib decoding ~3.8× faster
//! than noodles-cram (1.05M vs 273k records/s). htslib also handles `.crai`
//! indexing natively. SAM and BAM remain on noodles unchanged.
//!
//! htslib's textual SAM header is parsed into a `noodles::sam::Header` at
//! open time so every collector and helper sees the same `Header` type
//! regardless of format.

use std::fs::File;
use std::io::{BufReader, Cursor, Read};
use std::path::{Path, PathBuf};

use anyhow::{Context, Result, bail};
use flate2::read::MultiGzDecoder;
use noodles::core::Region;
use noodles::sam::Header;
use noodles::{bam, sam};
use rust_htslib::bam::Read as HtsRead;

use super::riker_record::{
    AuxTagRequirements, BamRec, FallbackRec, HtslibRec, RikerRecord, RikerRecordRequirements,
};

/// Capacity of the `BufReader` placed between the file and the BGZF decoder
/// on the sequential BAM path. The BGZF reader issues two `read_exact` calls
/// per block (18-byte header + payload up to ~64 KiB); a 256 KiB buffer covers
/// several blocks per refill and keeps the per-byte syscall overhead negligible.
const BAM_READ_BUFFER_BYTES: usize = 256 * 1024;

// ─── AlignmentReader (sequential) ───────────────────────────────────────────

/// Sequential SAM/BAM/CRAM reader. `Send`-safe; used by every command
/// that streams the input front-to-back. Multi's parallel reader thread
/// requires this type because the alternative
/// ([`IndexedAlignmentReader`]) holds a non-`Send` boxed index.
///
/// Open via [`open`](Self::open). For region queries on indexed BAM/CRAM
/// inputs, use [`IndexedAlignmentReader::open`] instead.
pub struct AlignmentReader {
    inner: Inner,
    header: Header,
}

/// Format-specific reader state for [`AlignmentReader`]. One variant per
/// supported sequential format. The CRAM arm is `Box`-ed because
/// `rust_htslib::bam::Reader` is large (a few hundred bytes of fields,
/// dwarfing the noodles BAM/SAM readers); boxing keeps the enum small
/// and avoids per-record copies during reader-handle moves.
enum Inner {
    Sam(sam::io::Reader<BufReader<File>>),
    GzippedSam(Box<sam::io::Reader<BufReader<MultiGzDecoder<File>>>>),
    Bam(bam::io::Reader<noodles_bgzf::io::Reader<BufReader<File>>>),
    Cram(Box<rust_htslib::bam::Reader>),
}

impl AlignmentReader {
    /// Open `path` for streaming reads. Format is detected from the
    /// extension (`.sam`, `.sam.gz`, `.bam`, `.cram`). A reference FASTA
    /// is required for CRAM and ignored for the others.
    ///
    /// # Errors
    /// Returns an error if the file cannot be opened, the header cannot
    /// be read, or a CRAM file is opened without a reference.
    pub fn open(path: &Path, reference: Option<&Path>) -> Result<Self> {
        match detect_format(path)? {
            AlignmentFormat::Sam => {
                let file = File::open(path).with_context(|| open_context(path))?;
                let mut reader = sam::io::Reader::new(BufReader::new(file));
                let header = reader.read_header().with_context(|| header_context(path))?;
                Ok(Self { inner: Inner::Sam(reader), header })
            }
            AlignmentFormat::GzippedSam => {
                let file = File::open(path).with_context(|| open_context(path))?;
                let mut reader = sam::io::Reader::new(BufReader::new(MultiGzDecoder::new(file)));
                let header = reader.read_header().with_context(|| header_context(path))?;
                Ok(Self { inner: Inner::GzippedSam(Box::new(reader)), header })
            }
            AlignmentFormat::Bam => {
                let file = File::open(path).with_context(|| open_context(path))?;
                let buf = BufReader::with_capacity(BAM_READ_BUFFER_BYTES, file);
                let bgzf_reader = noodles_bgzf::io::Reader::new(buf);
                let mut reader = bam::io::Reader::from(bgzf_reader);
                let header = reader.read_header().with_context(|| header_context(path))?;
                Ok(Self { inner: Inner::Bam(reader), header })
            }
            AlignmentFormat::Cram => match reference {
                Some(ref_path) => {
                    let mut reader = rust_htslib::bam::Reader::from_path(path)
                        .with_context(|| open_context(path))?;
                    reader.set_reference(ref_path).with_context(|| {
                        format!(
                            "Failed to set CRAM reference {} on {}",
                            ref_path.display(),
                            path.display()
                        )
                    })?;
                    let header = parse_htslib_header(reader.header().as_bytes())
                        .with_context(|| header_context(path))?;
                    Ok(Self { inner: Inner::Cram(Box::new(reader)), header })
                }
                None => {
                    bail!("CRAM files require a reference FASTA (--reference): {}", path.display())
                }
            },
        }
    }

    /// The parsed header.
    #[must_use]
    pub fn header(&self) -> &Header {
        &self.header
    }

    /// Construct an empty [`RikerRecord`] of the variant this reader
    /// fills on its in-place path. BAM hands back [`RikerRecord::Bam`];
    /// CRAM hands back [`RikerRecord::Htslib`]; SAM and gzipped-SAM hand
    /// back [`RikerRecord::Fallback`]. Used by callers that pre-allocate
    /// a pool of slots before reading.
    #[must_use]
    pub fn empty_record(&self) -> RikerRecord {
        match self.inner {
            Inner::Bam(_) => RikerRecord::Bam(BamRec::new()),
            Inner::Cram(_) => RikerRecord::Htslib(HtslibRec::new()),
            Inner::Sam(_) | Inner::GzippedSam(_) => RikerRecord::Fallback(FallbackRec::empty()),
        }
    }

    /// Read the next record into `slot` in place, decoding only the
    /// fields named in `requirements`. Returns `Ok(true)` on success,
    /// `Ok(false)` at EOF. Works for every format.
    ///
    /// `slot`'s variant must match this reader's format — BAM readers
    /// fill [`RikerRecord::Bam`]; CRAM readers fill
    /// [`RikerRecord::Htslib`]; SAM readers fill
    /// [`RikerRecord::Fallback`]. Use [`empty_record`](Self::empty_record)
    /// to pre-allocate a slot of the correct variant.
    ///
    /// # Errors
    /// Returns an error if the underlying read fails or if the slot's
    /// variant doesn't match the reader's format.
    pub fn fill_record(
        &mut self,
        requirements: &RikerRecordRequirements,
        slot: &mut RikerRecord,
    ) -> Result<bool> {
        match (&mut self.inner, slot) {
            (Inner::Bam(reader), RikerRecord::Bam(bam_rec)) => {
                fill_bam_slot(reader, bam_rec, requirements)
            }
            (Inner::Sam(reader), RikerRecord::Fallback(fb)) => {
                fill_sam_slot(reader, &self.header, fb)
            }
            (Inner::GzippedSam(reader), RikerRecord::Fallback(fb)) => {
                fill_sam_slot(reader, &self.header, fb)
            }
            (Inner::Cram(reader), RikerRecord::Htslib(hts_rec)) => {
                fill_htslib_slot(reader.as_mut(), hts_rec, requirements)
            }
            (Inner::Bam(_), _) => bail!("BAM reader requires a RikerRecord::Bam slot"),
            (Inner::Cram(_), _) => bail!("CRAM reader requires a RikerRecord::Htslib slot"),
            (Inner::Sam(_) | Inner::GzippedSam(_), _) => {
                bail!("SAM reader requires a RikerRecord::Fallback slot")
            }
        }
    }

    /// Returns an iterator that yields one [`RikerRecord`] per record in
    /// the file, allocating a fresh [`BamRec`] / [`FallbackRec`] /
    /// [`HtslibRec`] per yield. Works for every format. Use
    /// [`fill_record`](Self::fill_record) for hot paths instead —
    /// `fill_record` reuses one slot across the whole loop. This
    /// iterator is the right tool for low-volume callers where
    /// per-record allocation isn't the bottleneck.
    pub fn riker_records<'a>(
        &'a mut self,
        requirements: &'a RikerRecordRequirements,
    ) -> Box<dyn Iterator<Item = Result<RikerRecord>> + 'a> {
        let header = &self.header;
        match &mut self.inner {
            Inner::Bam(reader) => Box::new(BamRikerRecordIter { reader, requirements }),
            Inner::Sam(reader) => Box::new(reader.record_bufs(header).map(|result| {
                let buf = result.context("Failed to read SAM record")?;
                Ok(RikerRecord::Fallback(FallbackRec::from_record_buf(buf)))
            })),
            Inner::GzippedSam(reader) => Box::new(reader.record_bufs(header).map(|result| {
                let buf = result.context("Failed to read SAM record")?;
                Ok(RikerRecord::Fallback(FallbackRec::from_record_buf(buf)))
            })),
            Inner::Cram(reader) => {
                Box::new(HtslibRikerRecordIter { reader: reader.as_mut(), requirements })
            }
        }
    }
}

// ─── IndexedAlignmentReader ─────────────────────────────────────────────────

/// Indexed SAM/BAM/CRAM reader supporting region queries. Not `Send` —
/// noodles stores the BAM index as `Box<dyn BinningIndex>`, which lacks
/// a `Send` bound. Use [`AlignmentReader`] for sequential / parallel
/// pipelines; this type is reserved for callers that need
/// [`query`](Self::query) (currently just `error` standalone).
///
/// Open via [`open`](Self::open). Only BAM (`.bai`/`.csi`) and CRAM
/// (`.crai`) are supported; SAM has no widely-deployed index format.
pub struct IndexedAlignmentReader {
    inner: IndexedInner,
    header: Header,
}

/// Format-specific reader state for [`IndexedAlignmentReader`].
enum IndexedInner {
    Bam(bam::io::IndexedReader<noodles_bgzf::io::Reader<File>>),
    Cram(Box<rust_htslib::bam::IndexedReader>),
}

impl IndexedAlignmentReader {
    /// Open `path` with its index, enabling region queries via
    /// [`query`](Self::query). A reference FASTA is required for CRAM
    /// and ignored for BAM.
    ///
    /// # Errors
    /// Returns an error if the file or its index cannot be opened, the
    /// format is not BAM/CRAM, or a CRAM file is opened without a
    /// reference.
    pub fn open(path: &Path, reference: Option<&Path>) -> Result<Self> {
        match detect_format(path)? {
            AlignmentFormat::Sam | AlignmentFormat::GzippedSam => {
                bail!("SAM files cannot be indexed; use a BAM or CRAM file: {}", path.display())
            }
            AlignmentFormat::Bam => {
                validate_bam_index_exists(path)?;
                let mut reader = bam::io::indexed_reader::Builder::default()
                    .build_from_path(path)
                    .with_context(|| format!("Failed to open indexed BAM: {}", path.display()))?;
                let header = reader.read_header().with_context(|| header_context(path))?;
                Ok(Self { inner: IndexedInner::Bam(reader), header })
            }
            AlignmentFormat::Cram => match reference {
                Some(ref_path) => {
                    validate_cram_index_exists(path)?;
                    let mut reader = rust_htslib::bam::IndexedReader::from_path(path)
                        .with_context(|| {
                            format!("Failed to open indexed CRAM: {}", path.display())
                        })?;
                    reader.set_reference(ref_path).with_context(|| {
                        format!(
                            "Failed to set CRAM reference {} on {}",
                            ref_path.display(),
                            path.display()
                        )
                    })?;
                    let header = parse_htslib_header(reader.header().as_bytes())
                        .with_context(|| header_context(path))?;
                    Ok(Self { inner: IndexedInner::Cram(Box::new(reader)), header })
                }
                None => {
                    bail!("CRAM files require a reference FASTA (--reference): {}", path.display())
                }
            },
        }
    }

    /// The parsed header.
    #[must_use]
    pub fn header(&self) -> &Header {
        &self.header
    }

    /// Stream records overlapping `region` to the callback `f`,
    /// applying `requirements` per record. A single scratch
    /// [`RikerRecord`] is reused across the whole region — the cigar /
    /// quality / aux Vecs and (for CRAM) the htslib `bam1_t` buffer are
    /// recycled.
    ///
    /// Callbacks return `Result<()>` so the caller can propagate
    /// errors and abort the scan early.
    ///
    /// # Errors
    /// Returns an error if the underlying query, decode, or callback
    /// fails. The callback's error short-circuits the scan.
    pub fn query_for_each<F>(
        &mut self,
        region: &Region,
        requirements: &RikerRecordRequirements,
        mut f: F,
    ) -> Result<()>
    where
        F: FnMut(&RikerRecord) -> Result<()>,
    {
        let header = &self.header;
        match &mut self.inner {
            IndexedInner::Bam(reader) => {
                let query = reader
                    .query(header, region)
                    .with_context(|| format!("Failed to query BAM for region: {region}"))?;
                // One scratch slot for the whole region — install the
                // next raw record, refresh, then hand a borrow to the
                // callback. Cigar / quality / aux Vecs in the BamRec
                // are reused across iterations.
                let mut record = RikerRecord::Bam(BamRec::new());
                for result in query.records() {
                    let raw = result.context("Failed to read BAM record during query")?;
                    let RikerRecord::Bam(ref mut bam_rec) = record else {
                        unreachable!("scratch was constructed as RikerRecord::Bam");
                    };
                    bam_rec.install(raw)?;
                    apply_requirements(bam_rec, requirements)?;
                    f(&record)?;
                }
            }
            IndexedInner::Cram(reader) => {
                fetch_htslib_region(reader.as_mut(), region)?;
                let mut record = RikerRecord::Htslib(HtslibRec::new());
                // The let-else inside the loop body looks redundant
                // (scratch is always `Htslib`), but pulling the binding
                // outside the loop would extend the `&mut hts_rec`
                // borrow across `f(&record)` at the bottom and conflict
                // with the immutable borrow there. Re-binding per
                // iteration keeps each borrow scoped tightly enough
                // that both can coexist.
                loop {
                    let RikerRecord::Htslib(ref mut hts_rec) = record else {
                        unreachable!("scratch was constructed as RikerRecord::Htslib");
                    };
                    if !hts_rec.read_from(reader.as_mut())? {
                        break;
                    }
                    apply_requirements_htslib(hts_rec, requirements)?;
                    f(&record)?;
                }
            }
        }
        Ok(())
    }
}

// ─── Module-level helpers ───────────────────────────────────────────────────

/// BAM-side iterator yielding [`RikerRecord::Bam`] entries from a
/// sequential reader, with the caller's requirements applied.
struct BamRikerRecordIter<'a, R: Read> {
    reader: &'a mut bam::io::Reader<R>,
    requirements: &'a RikerRecordRequirements,
}

impl<R: Read> Iterator for BamRikerRecordIter<'_, R> {
    type Item = Result<RikerRecord>;

    fn next(&mut self) -> Option<Self::Item> {
        let mut bam_rec = BamRec::new();
        match bam_rec.read_from(self.reader) {
            Ok(0) => None,
            Ok(_) => match apply_requirements(&mut bam_rec, self.requirements) {
                Ok(()) => Some(Ok(RikerRecord::Bam(bam_rec))),
                Err(e) => Some(Err(e)),
            },
            Err(e) => Some(Err(e)),
        }
    }
}

/// CRAM-side iterator yielding [`RikerRecord::Htslib`] entries from a
/// rust-htslib reader. Allocates one [`HtslibRec`] per yield; for hot
/// paths use [`AlignmentReader::fill_record`] which reuses a single slot.
struct HtslibRikerRecordIter<'a> {
    reader: &'a mut rust_htslib::bam::Reader,
    requirements: &'a RikerRecordRequirements,
}

impl Iterator for HtslibRikerRecordIter<'_> {
    type Item = Result<RikerRecord>;

    fn next(&mut self) -> Option<Self::Item> {
        let mut hts_rec = HtslibRec::new();
        match hts_rec.read_from(self.reader) {
            Ok(false) => None,
            Ok(true) => match apply_requirements_htslib(&mut hts_rec, self.requirements) {
                Ok(()) => Some(Ok(RikerRecord::Htslib(hts_rec))),
                Err(e) => Some(Err(e)),
            },
            Err(e) => Some(Err(e)),
        }
    }
}

/// Read one BAM record into `bam_rec` from a sequential reader and
/// apply `requirements`.
fn fill_bam_slot<R: Read>(
    reader: &mut bam::io::Reader<R>,
    bam_rec: &mut BamRec,
    requirements: &RikerRecordRequirements,
) -> Result<bool> {
    let n = bam_rec.read_from(reader)?;
    if n == 0 {
        return Ok(false);
    }
    apply_requirements(bam_rec, requirements)?;
    Ok(true)
}

/// Read one SAM record into a [`FallbackRec`] in place and refresh the
/// cached scalars. Generic over the SAM reader's underlying byte
/// source so the same body covers plain `BufReader<File>` and the
/// gzip-wrapped variant.
///
/// On `read_record_buf` error the inner `RecordBuf` may be partially
/// overwritten, leaving the slot's cached scalars (e.g. `alignment_end`)
/// describing the previous record — refresh first, then propagate the
/// error, so any later observation of the slot sees self-consistent
/// state rather than half-stale data.
fn fill_sam_slot<R: std::io::BufRead>(
    reader: &mut sam::io::Reader<R>,
    header: &Header,
    fb: &mut FallbackRec,
) -> Result<bool> {
    match reader.read_record_buf(header, fb.inner_mut()) {
        Ok(0) => Ok(false),
        Ok(_) => {
            fb.refresh_cache();
            Ok(true)
        }
        Err(e) => {
            fb.refresh_cache();
            Err(anyhow::Error::from(e).context("Failed to read SAM record"))
        }
    }
}

/// Apply caller-declared decoder requirements to a freshly-read
/// [`BamRec`]: decode sequence bytes and/or scan the requested aux tags.
fn apply_requirements(bam_rec: &mut BamRec, requirements: &RikerRecordRequirements) -> Result<()> {
    if requirements.needs_sequence() {
        bam_rec.decode_sequence();
    }
    match requirements.aux_tags() {
        AuxTagRequirements::None => {}
        AuxTagRequirements::Specific { values, presence } => {
            bam_rec.scan_aux_tags(values, presence)?;
        }
        AuxTagRequirements::All => {
            bam_rec.decode_all_aux()?;
        }
    }
    Ok(())
}

/// Read one CRAM record into `hts_rec` and apply `requirements`. Used
/// by the sequential [`AlignmentReader::fill_record`] path. The
/// generic bound makes it callable on either rust-htslib reader type;
/// the indexed path inlines the same two-line body directly inside
/// [`IndexedAlignmentReader::query_for_each`] because it has its own
/// loop shape.
fn fill_htslib_slot<R: HtsRead>(
    reader: &mut R,
    hts_rec: &mut HtslibRec,
    requirements: &RikerRecordRequirements,
) -> Result<bool> {
    if !hts_rec.read_from(reader)? {
        return Ok(false);
    }
    apply_requirements_htslib(hts_rec, requirements)?;
    Ok(true)
}

/// CRAM analogue of [`apply_requirements`]: decode sequence bytes
/// and/or scan the requested aux tags on a freshly-read [`HtslibRec`].
fn apply_requirements_htslib(
    hts_rec: &mut HtslibRec,
    requirements: &RikerRecordRequirements,
) -> Result<()> {
    if requirements.needs_sequence() {
        hts_rec.decode_sequence();
    }
    match requirements.aux_tags() {
        AuxTagRequirements::None => {}
        AuxTagRequirements::Specific { values, presence } => {
            hts_rec.scan_aux_tags(values, presence)?;
        }
        AuxTagRequirements::All => {
            hts_rec.decode_all_aux()?;
        }
    }
    Ok(())
}

/// Translate a noodles `Region` into a fetch on a rust-htslib indexed
/// reader. Uses the contig name (not a noodles-side tid lookup) so the
/// htslib reader resolves the index against its own header view.
fn fetch_htslib_region(
    reader: &mut rust_htslib::bam::IndexedReader,
    region: &Region,
) -> Result<()> {
    use noodles::core::region::Interval;
    use rust_htslib::bam::FetchDefinition;

    let name = region.name();
    let interval: Interval = region.interval();
    // htslib bounds are 0-based half-open `[start, stop)` as `i64`.
    // The unbounded-interval branches use 0 / i64::MAX (start of contig
    // / end of contig); the `try_from` failure branches `.expect()` the
    // conversion because no real genomic position approaches i64::MAX
    // (it would imply contigs >9.2 billion bases). Marking the
    // unreachable case explicitly is clearer than the prior asymmetric
    // `unwrap_or(0)` / `unwrap_or(i64::MAX)`.
    let start: i64 = interval
        .start()
        .map_or(0, |p| i64::try_from(usize::from(p) - 1).expect("genomic position fits in i64"));
    let stop: i64 = interval
        .end()
        .map_or(i64::MAX, |p| i64::try_from(usize::from(p)).expect("genomic position fits in i64"));

    reader
        .fetch(FetchDefinition::RegionString(name, start, stop))
        .with_context(|| format!("Failed to fetch CRAM region: {region}"))
}

/// Parse htslib's textual SAM header bytes into a `noodles::sam::Header`.
/// Done at open time so the rest of the pipeline can keep using the
/// noodles `Header` type — every collector, helper, and `Region` query
/// downstream is already parameterised on it.
fn parse_htslib_header(header_bytes: &[u8]) -> Result<Header> {
    // htslib's `HeaderView::as_bytes()` returns an empty slice when
    // `sam_hdr_str()` returns NULL — i.e. when the CRAM has no parsed
    // header at all. noodles' `read_header()` would silently produce
    // an empty `Header` from those zero bytes; reject it explicitly so
    // a malformed CRAM surfaces as an error instead of an empty
    // reference list that silently mis-attributes records to whatever
    // contig index 0 happens to mean elsewhere.
    if header_bytes.is_empty() {
        bail!("CRAM file has an empty SAM header — refusing to read records against it");
    }
    let mut parser = sam::io::Reader::new(Cursor::new(header_bytes));
    parser.read_header().context("Failed to parse SAM header text from CRAM")
}

/// Detected alignment file format based on extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AlignmentFormat {
    Sam,
    GzippedSam,
    Bam,
    Cram,
}

/// Detect the alignment format from a file extension.
///
/// Recognizes `.sam`, `.sam.gz`, `.bam`, and `.cram` (case-insensitive).
fn detect_format(path: &Path) -> Result<AlignmentFormat> {
    let ext = path.extension().and_then(|e| e.to_str());

    // Check for .sam.gz: extension is "gz" and the stem ends with ".sam"
    if ext.is_some_and(|e| e.eq_ignore_ascii_case("gz")) {
        let stem = path.file_stem().unwrap_or_default();
        if Path::new(stem).extension().is_some_and(|e| e.eq_ignore_ascii_case("sam")) {
            return Ok(AlignmentFormat::GzippedSam);
        }
    }

    match ext {
        Some(e) if e.eq_ignore_ascii_case("sam") => Ok(AlignmentFormat::Sam),
        Some(e) if e.eq_ignore_ascii_case("bam") => Ok(AlignmentFormat::Bam),
        Some(e) if e.eq_ignore_ascii_case("cram") => Ok(AlignmentFormat::Cram),
        _ => bail!(
            "Cannot determine alignment format from extension. \
             Expected .sam, .sam.gz, .bam, or .cram: {}",
            path.display()
        ),
    }
}

/// Returns `path` with `suffix` (e.g. `".bai"`) appended after the
/// existing extension. Used to derive index-file paths.
fn append_extension(path: &Path, suffix: &str) -> PathBuf {
    let mut p = path.as_os_str().to_owned();
    p.push(suffix);
    PathBuf::from(p)
}

/// Confirm that an index for `path` exists in one of the three layouts
/// `samtools` writes:
///
/// - `<file>.bam.bai` (suffix appended) — `samtools index`'s default
/// - `<file>.bai` (extension replaced — sibling) — Picard's default
/// - `<file>.csi` (suffix appended) — large-contig index
///
/// `path.with_extension("bai")` gives the sibling form (replaces the
/// existing `.bam` extension); `append_extension(_, ".bai")` gives the
/// suffix form.
fn validate_bam_index_exists(path: &Path) -> Result<()> {
    let bai_sibling = path.with_extension("bai");
    let bai_suffix = append_extension(path, ".bai");
    let csi = append_extension(path, ".csi");

    if bai_sibling.exists() || bai_suffix.exists() || csi.exists() {
        return Ok(());
    }

    bail!(
        "BAM index not found. Expected one of:\n  {}\n  {}\n  {}\n\
         Run `samtools index {}` to create one.",
        bai_suffix.display(),
        bai_sibling.display(),
        csi.display(),
        path.display(),
    );
}

/// Confirm that an index for the CRAM at `path` exists in either of
/// the two layouts htslib auto-discovers:
///
/// - `<file>.cram.crai` (suffix appended) — `samtools index`'s default
/// - `<file>.crai`      (extension replaced — sibling) — Picard's default
///
/// Mirrors [`validate_bam_index_exists`]'s two-form check; htslib will
/// happily open either, so rejecting one would surface a confusing
/// "CRAM index not found" error for an index htslib would have used.
/// The pre-check exists to give a friendlier error than htslib's
/// generic open failure.
fn validate_cram_index_exists(path: &Path) -> Result<()> {
    let crai_sibling = path.with_extension("crai");
    let crai_suffix = append_extension(path, ".crai");
    if crai_suffix.exists() || crai_sibling.exists() {
        return Ok(());
    }
    bail!(
        "CRAM index not found. Expected one of:\n  {}\n  {}\n\
         Run `samtools index {}` to create one.",
        crai_suffix.display(),
        crai_sibling.display(),
        path.display(),
    );
}

/// Context message for "failed to open" errors.
fn open_context(path: &Path) -> String {
    format!("Failed to open alignment file: {}", path.display())
}

/// Context message for "failed to read header" errors.
fn header_context(path: &Path) -> String {
    format!("Failed to read header from: {}", path.display())
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Helper: create an empty file at `path`.
    fn touch(path: &Path) {
        fs::write(path, b"").expect("create empty file");
    }

    #[test]
    fn validate_bam_index_finds_suffix_form() {
        // `samtools index` default: foo.bam → foo.bam.bai
        let dir = TempDir::new().unwrap();
        let bam = dir.path().join("sample.bam");
        touch(&bam);
        touch(&dir.path().join("sample.bam.bai"));
        validate_bam_index_exists(&bam).expect("should accept suffix-form .bam.bai");
    }

    #[test]
    fn validate_bam_index_finds_sibling_form() {
        // Picard default: foo.bam → foo.bai
        let dir = TempDir::new().unwrap();
        let bam = dir.path().join("sample.bam");
        touch(&bam);
        touch(&dir.path().join("sample.bai"));
        validate_bam_index_exists(&bam).expect("should accept sibling-form .bai");
    }

    #[test]
    fn validate_bam_index_finds_csi() {
        let dir = TempDir::new().unwrap();
        let bam = dir.path().join("sample.bam");
        touch(&bam);
        touch(&dir.path().join("sample.bam.csi"));
        validate_bam_index_exists(&bam).expect("should accept .csi");
    }

    #[test]
    fn validate_bam_index_missing_errors() {
        let dir = TempDir::new().unwrap();
        let bam = dir.path().join("sample.bam");
        touch(&bam);
        let err = validate_bam_index_exists(&bam).expect_err("missing index should fail");
        let msg = err.to_string();
        assert!(msg.contains("BAM index not found"), "unexpected error: {msg}");
        assert!(msg.contains("sample.bam.bai"), "should mention suffix form: {msg}");
        assert!(msg.contains("sample.bai"), "should mention sibling form: {msg}");
    }

    #[test]
    fn validate_cram_index_finds_crai() {
        let dir = TempDir::new().unwrap();
        let cram = dir.path().join("sample.cram");
        touch(&cram);
        touch(&dir.path().join("sample.cram.crai"));
        validate_cram_index_exists(&cram).expect("should accept .cram.crai");
    }

    #[test]
    fn validate_cram_index_missing_errors() {
        let dir = TempDir::new().unwrap();
        let cram = dir.path().join("sample.cram");
        touch(&cram);
        let err = validate_cram_index_exists(&cram).expect_err("missing .crai should fail");
        assert!(err.to_string().contains("CRAM index not found"));
    }
}