seqair 0.1.0

Pure-Rust BAM/SAM/CRAM/FASTA reader and pileup engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
//! Open and query CRAM files. [`IndexedCramReader`] handles CRAM v3.0/v3.1 including
//! multi-ref slices, embedded references, coordinate clamping, and per-slice MD5 verification.
//! Call [`IndexedCramReader::fork`] for cheap per-thread readers.

use super::{
    super::bam::record_store::CustomizeRecordStore,
    block,
    compression_header::CompressionHeader,
    container::ContainerHeader,
    index::{self, CramIndex, CramIndexError},
    rans::Rans4x8Buf,
    rans_nx16::Nx16Order1Buf,
    slice,
};
use crate::bam::cigar::CigarOp;
use crate::bam::record::DecodeError;
use crate::bam::record_store::RecordStore;
use crate::bam::{BamHeader, BamHeaderError, BgzfError};
use crate::fasta::{FastaError, IndexedFastaReader};
use seqair_types::{Base, Pos0, Pos1, SmallVec, SmolStr};
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tracing::instrument;

#[cfg(feature = "fuzz")]
use std::io::Cursor;

#[non_exhaustive]
#[derive(Debug, thiserror::Error)]
pub enum CramError {
    #[error("I/O error opening {path}")]
    Open { path: PathBuf, source: std::io::Error },

    #[error("invalid CRAM magic (expected 'CRAM', found {found:?})")]
    InvalidMagic { found: [u8; 4] },

    #[error("unsupported CRAM version {major}.{minor}")]
    UnsupportedVersion { major: u8, minor: u8 },

    #[error("CRAM header error")]
    Header {
        #[from]
        source: BamHeaderError,
    },

    #[error("CRAI index not found for {cram_path} (expected .cram.crai or .crai)")]
    IndexNotFound { cram_path: PathBuf },

    #[error(transparent)]
    IndexParse {
        #[from]
        source: CramIndexError,
    },

    #[error("CRC32 mismatch in {context}: expected {expected:#010x}, got {found:#010x}")]
    ChecksumMismatch { context: &'static str, expected: u32, found: u32 },

    #[error(
        "unsupported compression codec {method} in block (content_type={content_type}, \
             content_id={content_id}). Convert to BAM with `samtools view -b`."
    )]
    UnsupportedCodec { method: u8, content_type: u8, content_id: i32 },

    #[error("unsupported encoding type {encoding_id}")]
    UnsupportedEncoding { encoding_id: i32 },

    #[error("truncated CRAM data: {context}")]
    Truncated { context: &'static str },

    #[error("FASTA reference error")]
    Fasta {
        #[from]
        source: FastaError,
    },

    #[error("reference MD5 mismatch for {contig}:{start}-{end}")]
    ReferenceMd5Mismatch { contig: SmolStr, start: u64, end: u64 },

    #[error(
        "missing reference for {contig} and no embedded reference in slice. \
             Consider setting REF_PATH/REF_CACHE for automatic reference download."
    )]
    MissingReference { contig: SmolStr },

    #[error("multi-ref slices not yet supported (Phase 2)")]
    MultiRefNotSupported,

    #[error("external block not found for content_id {content_id}")]
    ExternalBlockNotFound { content_id: i32 },

    // ── Block decoding ───────────────────────────────────────────────────────
    #[error("unknown block content type: {content_type}")]
    UnknownBlockContentType { content_type: u8 },

    #[error("gzip decompression failed in block")]
    GzipDecompressionFailed { source: libdeflater::DecompressionError },

    #[error("bzip2 decompression failed in block")]
    Bzip2DecompressionFailed { source: std::io::Error },

    #[error("lzma decompression failed in block")]
    LzmaDecompressionFailed { source: std::io::Error },

    // ── Block type mismatch ──────────────────────────────────────────────────
    #[error("expected compression header block, got {found:?}")]
    ExpectedCompressionHeader { found: block::ContentType },

    #[error("expected file header block, got {found:?}")]
    ExpectedFileHeader { found: block::ContentType },

    #[error("expected slice header block, got {found:?}")]
    ExpectedSliceHeader { found: block::ContentType },

    // ── Reader-level ─────────────────────────────────────────────────────────
    #[error("unknown target ID {tid}")]
    UnknownTid { tid: u32 },

    #[error("invalid CRAM alignment position {value}: out of valid range")]
    InvalidPosition { value: i64 },

    #[error("file header text is not valid UTF-8")]
    HeaderNotUtf8 { source: std::str::Utf8Error },

    // ── Slice-level ──────────────────────────────────────────────────────────
    #[error("no core data block in slice")]
    MissingCoreDataBlock,

    #[error("unknown CRAM feature code {feature_code:#04x} ('{}')", *feature_code as char)]
    UnknownFeatureCode { feature_code: u8 },

    // ── Encoding-level ───────────────────────────────────────────────────────
    #[error("Huffman alphabet size ({alphabet_size}) != bit_lengths size ({bit_lengths_size})")]
    HuffmanSizeMismatch { alphabet_size: usize, bit_lengths_size: usize },

    #[error("external byte array encoding for content_id={content_id} requires explicit length")]
    ExternalByteArrayNeedsLength { content_id: i32 },

    // ── rANS ─────────────────────────────────────────────────────────────────
    #[error("invalid rANS 4x8 order: {order}")]
    InvalidRansOrder { order: u8 },

    #[error(
        "rANS 4x8 header compressed_size ({advertised}) does not match remaining payload \
         length ({actual}) — file truncated or header corrupt"
    )]
    Rans4x8CompressedSizeMismatch { advertised: u32, actual: usize },

    #[error("rANS Nx16 stripe chunk_count must be > 0")]
    RansStripeZeroChunks,

    #[error("rANS Nx16 bit-pack symbol_count must be > 0")]
    RansBitPackZeroSymbols,

    #[error("rANS Nx16 bit-pack symbol count must be <= 16, got {symbol_count}")]
    RansBitPackTooManySymbols { symbol_count: usize },

    #[error(
        "malformed rANS alphabet/frequency-table run: starts at sym {start} with len {len} \
         (would extend past 255)"
    )]
    MalformedAlphabetRun { start: u8, len: u8 },

    // ── tok3 ─────────────────────────────────────────────────────────────────
    #[error("invalid tok3 token type: {token_type}")]
    InvalidTok3TokenType { token_type: u8 },

    #[error("tok3 adaptive arithmetic coder not supported")]
    Tok3ArithmeticCoderUnsupported,

    #[error("tok3 dup position {dup_pos} out of range")]
    Tok3DupPositionOutOfRange { dup_pos: usize },

    #[error("tok3 Delta token requires Digits predecessor, got token discriminant {found}")]
    Tok3DeltaRequiresDigits { found: u8 },

    #[error("tok3 Delta0 token requires PaddedDigits predecessor, got token discriminant {found}")]
    Tok3Delta0RequiresPaddedDigits { found: u8 },

    #[error("tok3 distance {distance} exceeds name index {name_index}")]
    Tok3DistanceExceedsIndex { distance: usize, name_index: usize },

    #[error("tok3 dup reference index {index} out of range")]
    Tok3DupRefOutOfRange { index: usize },

    #[error("tok3 name_count {count} exceeds limit {limit}")]
    Tok3NameCountExceedsLimit { count: usize, limit: usize },

    // ── Codec safety ─────────────────────────────────────────────────────────
    #[error("uint7 overflow: more than 5 continuation bytes")]
    Uint7Overflow,

    #[error("frequency normalization overflow: sum {sum} cannot be represented in u32")]
    FrequencyNormalizationOverflow { sum: u64 },

    #[error("invalid length from ITF8: {value} is negative")]
    InvalidLength { value: i32 },

    #[error("invalid BAM flags value: {value:#06x}")]
    InvalidBamFlags { value: i32, source: std::num::TryFromIntError },

    #[error("BGZF error")]
    Bgzf {
        #[from]
        source: BgzfError,
    },

    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    #[error("record decode error")]
    RecordDecode {
        #[from]
        source: DecodeError,
    },

    #[error(
        "claimed size {size} exceeds allocation limit ({limit}) in {context} — data may be corrupt"
    )]
    AllocationTooLarge { size: usize, limit: usize, context: &'static str },
}

/// Maximum allocation size (256 MiB) for a single decompressed block or parsed structure.
/// Real CRAM data stays well under this; exceeding it indicates corrupt/malicious input.
pub const MAX_ALLOC_SIZE: usize = 256 * 1024 * 1024;

/// Check that a size is within the allocation limit.
pub(crate) fn check_alloc_size(size: usize, context: &'static str) -> Result<(), CramError> {
    if size > MAX_ALLOC_SIZE {
        return Err(CramError::AllocationTooLarge { size, limit: MAX_ALLOC_SIZE, context });
    }
    Ok(())
}

pub struct CramShared {
    pub index: CramIndex,
    pub header: BamHeader,
    /// Pre-parsed `@RG ID:` values, indexed by RG order in the SAM header.
    /// CRAM records reference read groups by index into this list, so the
    /// alternative — re-scanning `header.header_text()` per record inside
    /// `decode_record` — was the dominant cost in `fetch_into_customized`
    /// profiles. Computed once at `open()` time.
    ///
    /// TODO(perf): this lives on `CramShared` because BAM/SAM keep RG in
    /// raw aux bytes and don't need the indexed lookup. If a future
    /// reader (e.g. SAM-from-CRAM converter) starts needing it too,
    /// promote the parsed list to `BamHeader` itself so every format
    /// shares one canonical cache instead of each opening parser
    /// re-scanning the header text.
    pub read_group_ids: Vec<SmolStr>,
    pub cram_path: PathBuf,
    pub fasta_path: PathBuf,
}

/// Parse `@RG ID:` values from SAM header text in declaration order.
fn parse_read_group_ids(header_text: &str) -> Vec<SmolStr> {
    let mut ids = Vec::new();
    for line in header_text.lines() {
        if !line.starts_with("@RG") {
            continue;
        }
        for field in line.split('\t') {
            if let Some(id) = field.strip_prefix("ID:") {
                ids.push(SmolStr::new(id));
                break;
            }
        }
    }
    ids
}

pub struct IndexedCramReader<R: Read + Seek = File> {
    file: R,
    fasta: IndexedFastaReader<R>,
    shared: Arc<CramShared>,
    // Scratch buffers reused across fetch_into calls. The per-record
    // ones (`name_buf`, `feature_byte_buf`, `cigar_ops_buf`) used to be
    // `Vec::new()` allocated inside `decode_record`, dominating profiles
    // in N-record hot loops. Hoisting them here means the second record
    // onward pays zero alloc cost.
    container_buf: Vec<u8>,
    cigar_buf: Vec<CigarOp>,
    bases_buf: Vec<Base>,
    qual_buf: Vec<u8>,
    aux_buf: Vec<u8>,
    ref_seq_buf: Vec<u8>,
    /// Reused across records: read name (qname) bytes.
    name_buf: Vec<u8>,
    /// Reused across records: per-feature byte payloads (insertion bases,
    /// soft-clip bases, bases-block, quality-block).
    feature_byte_buf: Vec<u8>,
    /// Reused across records: accumulated `(length, op_code)` CIGAR ops
    /// before being packed into BAM-layout `CigarOp`s.
    cigar_ops_buf: Vec<(u32, u8)>,
    /// Reused across blocks: rANS 4x8 order-1 tables (1.25 MB).
    /// Lazily allocated on first order-1 block.
    rans_4x8_buf: Option<Rans4x8Buf>,
    /// Reused across blocks: rANS Nx16 order-1 tables (∼512 KB).
    /// Lazily allocated on first order-1 block.
    nx16_order1_buf: Option<Nx16Order1Buf>,
}

impl<R: Read + Seek> std::fmt::Debug for IndexedCramReader<R> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("IndexedCramReader").field("path", &self.shared.cram_path).finish()
    }
}

impl IndexedCramReader<File> {
    /// Open a CRAM file with a FASTA reference for sequence reconstruction.
    #[instrument(level = "debug", fields(cram = %cram_path.display(), fasta = %fasta_path.display()))]
    pub fn open(cram_path: &Path, fasta_path: &Path) -> Result<Self, CramError> {
        let mut file = File::open(cram_path)
            .map_err(|source| CramError::Open { path: cram_path.to_path_buf(), source })?;

        // Read and verify file definition (26 bytes)
        let mut file_def = [0u8; 26];
        file.read_exact(&mut file_def)?;

        // r[impl cram.file.magic]
        if file_def[..4] != *b"CRAM" {
            return Err(CramError::InvalidMagic {
                found: [file_def[0], file_def[1], file_def[2], file_def[3]],
            });
        }

        // r[impl cram.scope.versions]
        let major = file_def[4];
        let minor = file_def[5];
        if major != 3 {
            return Err(CramError::UnsupportedVersion { major, minor });
        }

        // Read header container
        let header = read_header_container(&mut file)?;
        // r[impl unified.sort_order]
        header.validate_sort_order()?;

        // Find and parse CRAI index
        let crai_path = index::find_crai_path(cram_path)?;
        let cram_index = CramIndex::from_path(&crai_path)?;

        // r[impl cram.scope.reference_required]
        // Open FASTA reader
        let fasta = IndexedFastaReader::open(fasta_path)?;

        let read_group_ids = parse_read_group_ids(header.header_text());

        Ok(IndexedCramReader {
            file,
            fasta,
            shared: Arc::new(CramShared {
                index: cram_index,
                header,
                read_group_ids,
                cram_path: cram_path.to_path_buf(),
                fasta_path: fasta_path.to_path_buf(),
            }),
            container_buf: Vec::new(),
            cigar_buf: Vec::new(),
            bases_buf: Vec::new(),
            qual_buf: Vec::new(),
            aux_buf: Vec::new(),
            ref_seq_buf: Vec::new(),
            name_buf: Vec::new(),
            feature_byte_buf: Vec::new(),
            cigar_ops_buf: Vec::new(),
            rans_4x8_buf: None,
            nx16_order1_buf: None,
        })
    }

    pub fn fork(&self) -> Result<Self, CramError> {
        let file = File::open(&self.shared.cram_path)
            .map_err(|source| CramError::Open { path: self.shared.cram_path.clone(), source })?;
        let fasta = self.fasta.fork()?;
        Ok(IndexedCramReader {
            file,
            fasta,
            shared: Arc::clone(&self.shared),
            container_buf: Vec::new(),
            cigar_buf: Vec::new(),
            bases_buf: Vec::new(),
            qual_buf: Vec::new(),
            aux_buf: Vec::new(),
            ref_seq_buf: Vec::new(),
            name_buf: Vec::new(),
            feature_byte_buf: Vec::new(),
            cigar_ops_buf: Vec::new(),
            rans_4x8_buf: None,
            nx16_order1_buf: None,
        })
    }
}

#[cfg(feature = "fuzz")]
impl IndexedCramReader<Cursor<Vec<u8>>> {
    pub fn from_bytes(
        cram_data: Vec<u8>,
        crai_data: &[u8],
        fasta_gz_data: Vec<u8>,
        fai_contents: &str,
        gzi_data: &[u8],
    ) -> Result<Self, CramError> {
        // Parse the 26-byte file definition
        let mut file_def = [0u8; 26];
        file_def.copy_from_slice(
            cram_data.get(..26).ok_or(CramError::Truncated { context: "CRAM file definition" })?,
        );

        // r[impl cram.file.magic]
        if file_def[..4] != *b"CRAM" {
            return Err(CramError::InvalidMagic {
                found: [file_def[0], file_def[1], file_def[2], file_def[3]],
            });
        }

        // r[impl cram.scope.versions]
        let major = file_def[4];
        let minor = file_def[5];
        if major != 3 {
            return Err(CramError::UnsupportedVersion { major, minor });
        }

        let mut cursor = Cursor::new(cram_data.clone());
        cursor.seek(SeekFrom::Start(26))?;
        let header = read_header_container(&mut cursor)?;
        header.validate_sort_order()?;

        let crai_text =
            std::str::from_utf8(crai_data).map_err(|source| CramError::HeaderNotUtf8 { source })?;
        let cram_index = CramIndex::from_text(crai_text)?;

        let fasta = IndexedFastaReader::from_bytes(fasta_gz_data, fai_contents, gzi_data)?;

        let read_group_ids = parse_read_group_ids(header.header_text());

        Ok(IndexedCramReader {
            file: Cursor::new(cram_data),
            fasta,
            shared: Arc::new(CramShared {
                index: cram_index,
                header,
                read_group_ids,
                cram_path: PathBuf::from("<fuzz>"),
                fasta_path: PathBuf::from("<fuzz>"),
            }),
            container_buf: Vec::new(),
            cigar_buf: Vec::new(),
            bases_buf: Vec::new(),
            qual_buf: Vec::new(),
            aux_buf: Vec::new(),
            ref_seq_buf: Vec::new(),
            name_buf: Vec::new(),
            feature_byte_buf: Vec::new(),
            cigar_ops_buf: Vec::new(),
            rans_4x8_buf: None,
            nx16_order1_buf: None,
        })
    }
}

impl<R: Read + Seek> IndexedCramReader<R> {
    pub fn header(&self) -> &BamHeader {
        &self.shared.header
    }

    // r[impl region_buf.not_cram]
    // r[impl cram.container.region_skip]
    // r[impl cram.perf.slice_granularity]
    // r[impl cram.perf.reference_caching]
    // r[impl cram.perf.codec_overhead]
    /// Fetch records overlapping `[start, end)` (0-based) for the given tid.
    pub fn fetch_into(
        &mut self,
        tid: u32,
        start: Pos0,
        end: Pos0,
        store: &mut RecordStore,
    ) -> Result<usize, CramError> {
        self.fetch_into_customized(tid, start, end, store, &mut ()).map(|c| c.kept)
    }

    // r[impl unified.fetch_into_customized]
    // r[impl cram.fetch_into_customized.push_time]
    /// Customized variant: each record that passes the reader's built-in
    /// overlap/tid/unmapped checks is pushed into the store, with
    /// `customize.filter` consulted at push time. Rejection triggers
    /// the same zero-waste rollback used by BAM/SAM — the slabs are
    /// truncated back to their pre-push lengths. The returned
    /// [`FetchCounts`](crate::reader::FetchCounts) reports `fetched`
    /// (produced by the reader) vs `kept` (survived the filter).
    pub fn fetch_into_customized<E: CustomizeRecordStore>(
        &mut self,
        tid: u32,
        start: Pos0,
        end: Pos0,
        store: &mut RecordStore<E::Extra>,
        customize: &mut E,
    ) -> Result<crate::reader::FetchCounts, CramError> {
        store.clear();

        let start_u64 = start.as_u64();
        let end_u64 = end.as_u64();

        #[expect(
            clippy::cast_possible_wrap,
            reason = "tid bounded by BAM header limits (MAX_REFERENCES = 1M), fits i32"
        )]
        let entries = self.shared.index.query(tid as i32, start_u64, end_u64);
        if entries.is_empty() {
            return Ok(crate::reader::FetchCounts::default());
        }

        let mut fetched_total = 0usize;
        let mut kept_total = 0usize;

        // Group CRAI entries by container_offset → set of slice_offsets that
        // overlap our query. Keys are sorted (BTreeMap) so containers are
        // visited in file order. Slice_offset values are kept in a tiny
        // SmallVec since most containers list 1–4 relevant slices.
        // r[impl cram.index.crai_per_slice]
        let mut wanted: std::collections::BTreeMap<u64, SmallVec<u64, 4>> =
            std::collections::BTreeMap::new();
        for e in &entries {
            let entry = wanted.entry(e.container_offset).or_default();
            if !entry.contains(&e.slice_offset) {
                entry.push(e.slice_offset);
            }
        }

        // Get reference name for FASTA lookup. Borrow into the Arc'd
        // header — the FASTA fetch only needs `&str`, and the cold
        // MissingReference error path is the single place we materialize
        // a `SmolStr`.
        let ref_name: &str =
            self.shared.header.target_name(tid).ok_or(CramError::UnknownTid { tid })?;

        // TODO(perf): container-scoped re-reading and re-parsing.
        //
        // Each `fetch_into` call below re-reads the container bytes off
        // disk into `self.container_buf` (E) and re-parses the
        // compression header — including `tag_dictionary` and
        // `tag_encodings` — fresh every time (D). For multi-segment
        // pileup workflows (Readers::pileup over many segments of the
        // same contig) successive fetches frequently revisit the same
        // container, so this is repeated work.
        //
        // Fix options:
        //
        //   1. One-entry "last container" cache: store
        //      `Option<(container_offset, ContainerHeader, CompressionHeader, Vec<u8>)>`
        //      on the reader. On hit, skip the seek + read + parse and
        //      reuse `container_buf` + `ch`. Trivial to implement,
        //      handles the common case where consecutive segments fall
        //      in the same container. Memory cost: one extra
        //      `container_buf`-sized Vec.
        //
        //   2. Small LRU (size 2-4): same idea but covers the case
        //      where neighbouring segments span 2-3 containers. Picks
        //      up overlapping segments and forked workers iterating
        //      adjacent ranges.
        //
        //   3. Per-tile container plan: when used through
        //      `Readers::pileup`, the outer loop knows the segment
        //      sequence ahead of time. Could pre-group segments by
        //      container at plan time and only fetch each container
        //      once. Largest win, but couples the unified reader to
        //      the segment iterator.
        //
        // Profile signal: look for `block::parse_block`,
        // `CompressionHeader::parse`, and `File::read_exact` near the
        // top of `fetch_into_customized` self-time.
        for (&container_offset, wanted_slices) in &wanted {
            self.file.seek(SeekFrom::Start(container_offset))?;

            // Read container header
            let mut header_buf = [0u8; 1024]; // container headers are typically < 100 bytes
            let bytes_read = self.file.read(&mut header_buf)?;
            let container_header = ContainerHeader::parse(
                header_buf
                    .get(..bytes_read)
                    .ok_or(CramError::Truncated { context: "container header buf" })?,
            )?;

            if container_header.is_eof() {
                continue;
            }

            // Read container data; non-negative checked, so try_from is infallible here.
            let data_len = usize::try_from(container_header.length)
                .map_err(|_| CramError::Truncated { context: "container negative length" })?;
            check_alloc_size(data_len, "container data")?;
            self.container_buf.clear();
            self.container_buf.resize(data_len, 0);
            self.file.seek(SeekFrom::Start(
                container_offset
                    .checked_add(container_header.header_size as u64)
                    .ok_or(CramError::Truncated { context: "container seek offset overflow" })?,
            ))?;
            self.file.read_exact(&mut self.container_buf)?;

            // Parse compression header (first block)
            let (comp_block, _) = block::parse_block(&self.container_buf)?;
            if comp_block.content_type != block::ContentType::CompressionHeader {
                return Err(CramError::ExpectedCompressionHeader {
                    found: comp_block.content_type,
                });
            }
            let ch = CompressionHeader::parse(&comp_block.data)?;

            // Fetch reference sequence for the query tid's range.
            // For multi-ref containers (ref_seq_id=-2), the container's alignment
            // range covers multiple references — use the CRAI entry's range for our tid instead.
            let (ref_start, ref_end_clamped) = if container_header.ref_seq_id == -2 {
                // Multi-ref: find the CRAI entry for this container + our tid
                let crai_entry = entries.iter().find(|e| e.container_offset == container_offset);
                match crai_entry {
                    Some(e) if e.alignment_span > 0 => {
                        let s = Pos1::try_from(e.alignment_start.max(1))
                            .map_err(|_| CramError::InvalidPosition { value: e.alignment_start })?
                            .to_zero_based()
                            .as_u64();
                        let e_end =
                            s.checked_add(e.alignment_span as u64).ok_or(CramError::Truncated {
                                context: "crai alignment end overflow",
                            })?;
                        let ref_len = self.shared.header.target_len(tid).unwrap_or(0);
                        (s, e_end.min(ref_len))
                    }
                    _ => {
                        // No CRAI span info — fetch the full contig
                        let ref_len = self.shared.header.target_len(tid).unwrap_or(0);
                        (start_u64.min(ref_len), end_u64.min(ref_len))
                    }
                }
            } else {
                let ref_start = Pos1::try_from(container_header.alignment_start.max(1))
                    .map_err(|_| CramError::InvalidPosition {
                        value: i64::from(container_header.alignment_start),
                    })?
                    .to_zero_based()
                    .as_u64();
                let ref_end = ref_start
                    .checked_add(container_header.alignment_span as u64)
                    .ok_or(CramError::Truncated { context: "container alignment end overflow" })?;
                let ref_len = self.shared.header.target_len(tid).unwrap_or(0);
                (ref_start, ref_end.min(ref_len))
            };

            self.ref_seq_buf.clear();
            if ref_start < ref_end_clamped {
                // r[impl cram.edge.missing_reference]
                self.fasta
                    .fetch_seq_into(
                        ref_name,
                        Pos0::try_from(ref_start)
                            .map_err(|_| CramError::InvalidPosition {
                            #[expect(
                                clippy::cast_possible_wrap,
                                reason = "error reporting only; value may exceed i64::MAX but is only used for diagnostics"
                            )]
                            value: ref_start as i64,
                        })?,
                        Pos0::try_from(ref_end_clamped).unwrap_or(Pos0::max_value()),
                        &mut self.ref_seq_buf,
                    )
                    .map_err(|e| match &e {
                        FastaError::SequenceNotFound { .. } => {
                            CramError::MissingReference { contig: SmolStr::new(ref_name) }
                        }
                        _ => CramError::from(e),
                    })?;
            }

            // Decode each slice listed by CRAI as overlapping our query.
            // CRAI's `slice_offset` matches the container's landmark value
            // (both are byte offsets from the start of the container's data
            // block to the slice header). Skipping non-listed landmarks
            // avoids decoding slices we know cannot overlap — a strict win
            // for multi-slice / multi-ref containers, no-op otherwise.
            // r[impl cram.index.crai_per_slice]
            for &landmark in &container_header.landmarks {
                let slice_offset = usize::try_from(landmark)
                    .map_err(|_| CramError::InvalidLength { value: landmark })?;
                let landmark_u64 = u64::try_from(landmark)
                    .map_err(|_| CramError::InvalidLength { value: landmark })?;
                if !wanted_slices.contains(&landmark_u64) {
                    continue;
                }

                // r[impl cram.edge.coordinate_clamp]
                let (slice_fetched, slice_kept) = slice::decode_slice(
                    &ch,
                    &self.container_buf,
                    slice_offset,
                    &self.ref_seq_buf,
                    ref_start.cast_signed(),
                    &self.shared.header,
                    &self.shared.read_group_ids,
                    tid,
                    start,
                    end,
                    store,
                    &mut self.cigar_buf,
                    &mut self.bases_buf,
                    &mut self.qual_buf,
                    &mut self.aux_buf,
                    &mut self.name_buf,
                    &mut self.feature_byte_buf,
                    &mut self.cigar_ops_buf,
                    customize,
                    &mut self.rans_4x8_buf,
                    &mut self.nx16_order1_buf,
                )?;
                fetched_total = fetched_total.saturating_add(slice_fetched);
                kept_total = kept_total.saturating_add(slice_kept);
            }
        }

        Ok(crate::reader::FetchCounts { fetched: fetched_total, kept: kept_total })
    }
}

// r[impl cram.file.header_container]
/// Read the file header container and extract the SAM header text.
fn read_header_container<R: Read + Seek>(file: &mut R) -> Result<BamHeader, CramError> {
    // Read enough for the container header (usually < 100 bytes)
    let mut buf = [0u8; 4096];
    let pos = file.stream_position()?;
    let bytes_read = file.read(&mut buf)?;

    let container_header = ContainerHeader::parse(
        buf.get(..bytes_read).ok_or(CramError::Truncated { context: "header container buf" })?,
    )?;

    // Read the container data; non-negative checked above so the cast is safe.
    let data_len = usize::try_from(container_header.length)
        .map_err(|_| CramError::Truncated { context: "header container negative length" })?;
    check_alloc_size(data_len, "header container data")?;
    let mut data = vec![0u8; data_len];
    file.seek(SeekFrom::Start(
        pos.checked_add(container_header.header_size as u64)
            .ok_or(CramError::Truncated { context: "header container seek offset overflow" })?,
    ))?;
    file.read_exact(&mut data)?;

    // Parse the file header block
    let (blk, _) = block::parse_block(&data)?;
    if blk.content_type != block::ContentType::FileHeader {
        return Err(CramError::ExpectedFileHeader { found: blk.content_type });
    }

    // Block data: i32 header_text_length + UTF-8 text
    if blk.data.len() < 4 {
        return Err(CramError::Truncated { context: "file header block data" });
    }
    let text_len_i32 = i32::from_le_bytes(
        blk.data
            .get(..4)
            .ok_or(CramError::Truncated { context: "file header block data" })?
            .try_into()
            .map_err(|_| CramError::Truncated { context: "file header block data" })?,
    );
    let text_len = usize::try_from(text_len_i32)
        .map_err(|_| CramError::InvalidLength { value: text_len_i32 })?;
    let text_end = 4usize
        .checked_add(text_len)
        .ok_or(CramError::Truncated { context: "file header text length overflow" })?;
    let text_bytes =
        blk.data.get(4..text_end).ok_or(CramError::Truncated { context: "file header text" })?;
    let header_text =
        std::str::from_utf8(text_bytes).map_err(|source| CramError::HeaderNotUtf8 { source })?;

    Ok(BamHeader::from_sam_text(header_text)?)
}

#[cfg(test)]
mod tests {
    use super::*;

    fn cram_path() -> &'static Path {
        Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test_v30.cram"))
    }

    fn fasta_path() -> &'static Path {
        Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test.fasta.gz"))
    }

    #[test]
    fn open_cram_reader() {
        let reader = IndexedCramReader::open(cram_path(), fasta_path()).unwrap();
        assert!(reader.header().target_count() > 0);
    }

    #[test]
    fn fork_cram_reader() {
        let reader = IndexedCramReader::open(cram_path(), fasta_path()).unwrap();
        let forked = reader.fork().unwrap();
        assert_eq!(reader.header().target_count(), forked.header().target_count());
    }

    #[test]
    fn fetch_into_returns_records() {
        let mut reader = IndexedCramReader::open(cram_path(), fasta_path()).unwrap();
        let mut store = RecordStore::new();

        // Fetch from first reference
        let tid = 0;
        let count =
            reader.fetch_into(tid, Pos0::new(0).unwrap(), Pos0::max_value(), &mut store).unwrap();
        assert!(count > 0, "should fetch records from tid={tid}");
    }

    // r[verify cram.edge.missing_reference]
    #[test]
    fn missing_reference_gives_helpful_error() {
        let err = CramError::MissingReference { contig: SmolStr::new("chr19") };
        let msg = format!("{err}");
        assert!(msg.contains("chr19"), "error should mention the contig name");
        assert!(
            msg.contains("REF_PATH") || msg.contains("REF_CACHE"),
            "error should mention REF_PATH/REF_CACHE as alternatives, got: {msg}"
        );
    }

    #[test]
    fn unknown_tid_error_variant() {
        // UnknownTid is returned in fetch_into when target_name(tid) returns None.
        let err = CramError::UnknownTid { tid: 9999 };
        assert!(matches!(err, CramError::UnknownTid { tid: 9999 }));
        let msg = format!("{err}");
        assert!(msg.contains("9999"), "error message should contain the tid: {msg}");
    }

    #[test]
    fn header_not_utf8_from_invalid_bytes() {
        // read_header_container calls std::str::from_utf8 on the header text bytes and
        // returns CramError::HeaderNotUtf8 on failure. We test the construction path
        // directly since read_header_container is private.
        // Build the invalid byte at runtime to avoid the invalid_from_utf8 lint.
        let invalid_byte = 0xFFu8;
        let invalid_utf8 = std::slice::from_ref(&invalid_byte);
        let err = std::str::from_utf8(invalid_utf8).unwrap_err();
        let cram_err = CramError::HeaderNotUtf8 { source: err };
        assert!(matches!(cram_err, CramError::HeaderNotUtf8 { .. }));
        let msg = format!("{cram_err}");
        assert!(
            msg.contains("UTF-8") || msg.contains("utf-8") || msg.contains("valid"),
            "message: {msg}"
        );
    }
}