rosalind-bio 0.1.0

Deterministic, low-memory genomics engine: memory as a verifiable contract (declare → predict → honor → verify) for alignment and variant calling
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
//! Index reader/writer for Rosalind's on-disk reference index format.
//!
//! Single-file, little-endian, section-based (see `docs/index-format.md`). Every
//! multi-byte array is written as little-endian words at an 8-byte-aligned file
//! offset so the reader can reinterpret mmap bytes as `&[u64]`/`&[u32]` with a
//! checked `slice::align_to` (zero-copy, no rebuild). Serialization is
//! deterministic: fixed section order, fixed-width LE encoding, zero padding, no
//! timestamps ⇒ byte-identical across repeated builds of the same genome.

use std::fs::File;
use std::io::{self, BufWriter, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};

use blake3::Hasher;
use thiserror::Error;

use crate::core::ContigSet;
use crate::genomics::index::format::{
    IndexHeader, IndexVersion, SectionEntry, SectionKind, ROSALIND_INDEX_MAGIC,
};
use crate::genomics::{CompressedDNA, GenomeIndex};
use crate::util::mmap::MmapReadOnly;

/// Errors for index IO.
#[derive(Debug, Error)]
pub enum IndexIoError {
    #[error("io error: {0}")]
    Io(#[from] io::Error),

    #[error("invalid index: {0}")]
    Invalid(String),
}

/// A loaded reference index: a memory-mapped byte buffer plus the parsed header,
/// section table, and contig set. The FM-index query surface is obtained as a
/// borrowed [`crate::genomics::FmIndexView`] via [`ReferenceIndex::view`].
#[derive(Debug)]
pub struct ReferenceIndex {
    /// Path to the index file.
    pub path: PathBuf,
    /// Parsed file header.
    pub header: IndexHeader,
    /// Parsed contig set (names, lengths, global offsets).
    pub(crate) contigs: ContigSet,
    /// Parsed section table.
    pub sections: Vec<SectionEntry>,
    pub(crate) mmap: MmapReadOnly,
}

impl ReferenceIndex {
    /// Return the raw bytes of the memory-mapped index file.
    pub fn bytes(&self) -> &[u8] {
        self.mmap.as_bytes()
    }

    /// The parsed contig set.
    pub fn contigs(&self) -> &ContigSet {
        &self.contigs
    }

    /// Borrow a zero-copy FM-index view over the memory-mapped sections.
    pub fn view(&self) -> Result<crate::genomics::index::view::FmIndexView<'_>, IndexIoError> {
        crate::genomics::index::view::FmIndexView::new(self.mmap.as_bytes(), &self.sections)
    }

    /// Borrow a zero-copy genome view (FM-index + contig set) for `locate_exact`.
    pub fn genome_view(
        &self,
    ) -> Result<crate::genomics::index::view::GenomeIndexView<'_>, IndexIoError> {
        let fm = self.view()?;
        Ok(crate::genomics::index::view::GenomeIndexView::new(
            fm,
            &self.contigs,
        ))
    }

    /// Borrow a zero-copy view over the persisted 2-bit reference (`Reference2bit`).
    pub fn reference_view(
        &self,
    ) -> Result<crate::genomics::index::view::ReferenceView<'_>, IndexIoError> {
        crate::genomics::index::view::ReferenceView::new(self.mmap.as_bytes(), &self.sections)
    }
}

/// Writes a new index file.
#[derive(Debug)]
pub struct IndexWriter {
    file: File,
}

impl IndexWriter {
    /// Create a new index file for writing (overwriting if it exists).
    pub fn create(path: impl AsRef<Path>) -> Result<Self, IndexIoError> {
        let file = File::create(path)?;
        Ok(Self { file })
    }

    /// Serialise a built [`GenomeIndex`] into the deterministic on-disk format.
    ///
    /// Sections are written in a fixed order, each padded to an 8-byte boundary:
    /// `Contigs`, `Reference2bit`, `FmMeta`, `Boundaries`, `Blocks`, `SaSamples`,
    /// then the section table; the header is rewritten last with the section
    /// table's location. `reference_blake3` is the BLAKE3 of the uppercased ASCII
    /// reference (the stable identity used by the B4 stale-index guard).
    pub fn write_genome_index(self, index: &GenomeIndex) -> Result<(), IndexIoError> {
        let fm = index.fm();
        let contigs = index.contigs();
        if contigs.is_empty() {
            return Err(IndexIoError::Invalid(
                "index must contain at least one contig".to_string(),
            ));
        }

        let reference_blake3 = {
            let mut hasher = Hasher::new();
            hasher.update(index.reference());
            *hasher.finalize().as_bytes()
        };
        let sa_sample_rate = u32::try_from(fm.sa_sample_rate())
            .map_err(|_| IndexIoError::Invalid("sa_sample_rate exceeds u32".to_string()))?;
        let contig_count = u32::try_from(contigs.len())
            .map_err(|_| IndexIoError::Invalid("too many contigs (exceeds u32)".to_string()))?;
        let mut header = IndexHeader::new_v1(contig_count, sa_sample_rate, reference_blake3);

        let mut w = BufWriter::new(self.file);

        // Reserve space for the header (rewritten at the end).
        w.write_all(&[0u8; IndexHeader::FIXED_SIZE])?;

        let mut sections: Vec<SectionEntry> = Vec::new();

        // --- Contigs (byte-copy read on load; alignment not required, but we pad
        // every section start to 8 uniformly). ---
        let off = pad_to_8(&mut w)?;
        for contig in contigs.iter() {
            let name = contig.name.as_bytes();
            let name_len = u32::try_from(name.len())
                .map_err(|_| IndexIoError::Invalid("contig name too long".to_string()))?;
            w.write_all(&name_len.to_le_bytes())?;
            w.write_all(name)?;
            w.write_all(&(contig.length as u64).to_le_bytes())?;
            w.write_all(&contig.global_offset.to_le_bytes())?;
        }
        push_section(&mut sections, SectionKind::Contigs, off, &mut w)?;

        // --- Reference2bit: len, data_words, amb_words, then data[u64], amb[u64]. ---
        let off = pad_to_8(&mut w)?;
        let reference_2bit = CompressedDNA::compress(index.reference())
            .map_err(|e| IndexIoError::Invalid(format!("reference compress: {e}")))?;
        w.write_all(&(reference_2bit.len() as u64).to_le_bytes())?;
        w.write_all(&(reference_2bit.words().len() as u64).to_le_bytes())?;
        w.write_all(&(reference_2bit.ambiguity().bits().len() as u64).to_le_bytes())?;
        write_u64_slice(&mut w, reference_2bit.words())?;
        write_u64_slice(&mut w, reference_2bit.ambiguity().bits())?;
        push_section(&mut sections, SectionKind::Reference2bit, off, &mut w)?;

        // --- FmMeta: 5 u64 then c_table[6 u32]. ---
        let off = pad_to_8(&mut w)?;
        w.write_all(&(fm.block_size() as u64).to_le_bytes())?;
        w.write_all(&(fm.len() as u64).to_le_bytes())?;
        w.write_all(&(fm.sentinel_position() as u64).to_le_bytes())?;
        w.write_all(&(fm.sa_sample_rate() as u64).to_le_bytes())?;
        w.write_all(&(fm.num_blocks() as u64).to_le_bytes())?;
        for v in *fm.c_table() {
            w.write_all(&v.to_le_bytes())?;
        }
        push_section(&mut sections, SectionKind::FmMeta, off, &mut w)?;

        // --- Boundaries: num_blocks+1 entries of [cumulative_counts; 5] + sentinel. ---
        let off = pad_to_8(&mut w)?;
        debug_assert_eq!(fm.boundaries().len(), fm.num_blocks() + 1);
        for boundary in fm.boundaries().iter() {
            for c in boundary.cumulative_counts {
                w.write_all(&c.to_le_bytes())?;
            }
            w.write_all(&boundary.sentinel_count.to_le_bytes())?;
        }
        push_section(&mut sections, SectionKind::Boundaries, off, &mut w)?;

        // --- Blocks: directory of num_blocks u64 record-offsets, then records. ---
        let off = pad_to_8(&mut w)?;
        write_blocks_section(&mut w, fm)?;
        push_section(&mut sections, SectionKind::Blocks, off, &mut w)?;

        // --- SaSamples: rate, bwt_len, marks_words, sb_len, values_len, then arrays. ---
        let off = pad_to_8(&mut w)?;
        let sampled = fm.sampled();
        w.write_all(&(sampled.rate() as u64).to_le_bytes())?;
        w.write_all(&(sampled.len() as u64).to_le_bytes())?;
        w.write_all(&(sampled.marks().len() as u64).to_le_bytes())?;
        w.write_all(&(sampled.superblocks().len() as u64).to_le_bytes())?;
        w.write_all(&(sampled.values().len() as u64).to_le_bytes())?;
        write_u64_slice(&mut w, sampled.marks())?;
        write_u32_slice(&mut w, sampled.superblocks())?;
        write_u32_slice(&mut w, sampled.values())?;
        push_section(&mut sections, SectionKind::SaSamples, off, &mut w)?;

        // --- Section table, then rewrite the header. ---
        let section_table_offset = pad_to_8(&mut w)?;
        write_section_table(&mut w, &sections)?;
        let section_table_end = w.stream_position()?;

        header.section_table_offset = section_table_offset;
        header.section_table_bytes = section_table_end - section_table_offset;

        w.seek(SeekFrom::Start(0))?;
        write_header(&mut w, &header)?;
        w.flush()?;
        Ok(())
    }
}

/// Pad `w` with zero bytes up to the next 8-byte boundary; return the (aligned)
/// current offset.
fn pad_to_8(w: &mut (impl Write + Seek)) -> Result<u64, IndexIoError> {
    let pos = w.stream_position()?;
    let rem = pos % 8;
    if rem == 0 {
        return Ok(pos);
    }
    let pad = (8 - rem) as usize;
    w.write_all(&[0u8; 8][..pad])?;
    Ok(pos + pad as u64)
}

/// Record a section spanning `[offset, current_position)`.
fn push_section(
    sections: &mut Vec<SectionEntry>,
    kind: SectionKind,
    offset: u64,
    w: &mut (impl Write + Seek),
) -> Result<(), IndexIoError> {
    let end = w.stream_position()?;
    sections.push(SectionEntry {
        kind,
        offset,
        bytes: end - offset,
    });
    Ok(())
}

fn write_u64_slice(w: &mut impl Write, words: &[u64]) -> Result<(), IndexIoError> {
    for &word in words {
        w.write_all(&word.to_le_bytes())?;
    }
    Ok(())
}

fn write_u32_slice(w: &mut impl Write, words: &[u32]) -> Result<(), IndexIoError> {
    for &word in words {
        w.write_all(&word.to_le_bytes())?;
    }
    Ok(())
}

/// Write the `Blocks` section: a `num_blocks × u64` directory of record offsets
/// (relative to the section start, each 8-aligned), then one self-describing
/// record per block. Two passes — sizes first (to lay out the directory), then
/// the records — so the writer streams without materialising the whole section.
fn write_blocks_section(
    w: &mut impl Write,
    fm: &crate::genomics::BlockedFMIndex,
) -> Result<(), IndexIoError> {
    let num_blocks = fm.num_blocks();
    let dir_bytes = (num_blocks as u64) * 8;

    // Pass 1: record offsets (relative to section start).
    let mut offsets = Vec::with_capacity(num_blocks);
    let mut cursor = dir_bytes;
    for block in fm.blocks() {
        offsets.push(cursor);
        cursor += block_record_len(block);
    }

    // Directory.
    for &o in &offsets {
        w.write_all(&o.to_le_bytes())?;
    }

    // Pass 2: records.
    for block in fm.blocks() {
        write_block_record(w, block)?;
    }
    Ok(())
}

/// Byte length of one block record (header + payload), padded to 8.
///
/// All five occ rank bitvectors share one length, and all five superblock arrays
/// share one length (`RankSelectIndex::build` allocates them uniformly). This is
/// load-bearing: the directory offsets computed here must match the bytes
/// `write_block_record` emits, which also use `[0].len()` for all five.
fn block_record_len(block: &crate::genomics::BWTBlock) -> u64 {
    let bwt_data_words = block.bwt().words().len() as u64;
    let bwt_amb_words = block.bwt().ambiguity().bits().len() as u64;
    let occ_bitvec_words = block.occ().bitvectors()[0].len() as u64;
    let occ_superblock_len = block.occ().superblocks()[0].len() as u64;
    debug_assert!(
        block
            .occ()
            .bitvectors()
            .iter()
            .all(|bv| bv.len() as u64 == occ_bitvec_words),
        "occ rank bitvectors must share one length (serialization invariant)"
    );
    debug_assert!(
        block
            .occ()
            .superblocks()
            .iter()
            .all(|sb| sb.len() as u64 == occ_superblock_len),
        "occ superblock arrays must share one length (serialization invariant)"
    );
    let body = 64 // 8 header u64/i64 fields
        + 8 * bwt_data_words
        + 8 * bwt_amb_words
        + 8 * 5 * occ_bitvec_words
        + 4 * 5 * occ_superblock_len
        + 4 * 5; // totals[5]
                 // Round up to a multiple of 8. `(body + 7) / 8 * 8`, not `body.div_ceil(8) * 8`
                 // (div_ceil is Rust 1.73+; MSRV is 1.72).
    body.div_ceil(8) * 8
}

/// Write one block record: 8 header fields (`start,end,sentinel_offset,stride,
/// bwt_data_words,bwt_amb_words,occ_bitvec_words,occ_superblock_len`), then
/// `bwt_data[u64]`, `bwt_amb[u64]`, 5×`occ_bitvec[u64]`, 5×`occ_superblock[u32]`,
/// `totals[u32;5]`, padded to 8.
fn write_block_record(
    w: &mut impl Write,
    block: &crate::genomics::BWTBlock,
) -> Result<(), IndexIoError> {
    let bwt = block.bwt();
    let occ = block.occ();
    let bwt_data_words = bwt.words().len() as u64;
    let bwt_amb_words = bwt.ambiguity().bits().len() as u64;
    let occ_bitvec_words = occ.bitvectors()[0].len() as u64;
    let occ_superblock_len = occ.superblocks()[0].len() as u64;

    let sentinel_offset: i64 = match block.sentinel_offset() {
        Some(o) => o as i64,
        None => -1,
    };

    w.write_all(&(block.start() as u64).to_le_bytes())?;
    w.write_all(&(block.end() as u64).to_le_bytes())?;
    w.write_all(&sentinel_offset.to_le_bytes())?;
    w.write_all(&(occ.stride() as u64).to_le_bytes())?;
    w.write_all(&bwt_data_words.to_le_bytes())?;
    w.write_all(&bwt_amb_words.to_le_bytes())?;
    w.write_all(&occ_bitvec_words.to_le_bytes())?;
    w.write_all(&occ_superblock_len.to_le_bytes())?;

    write_u64_slice(w, bwt.words())?;
    write_u64_slice(w, bwt.ambiguity().bits())?;
    for bv in occ.bitvectors() {
        write_u64_slice(w, bv)?;
    }
    for sb in occ.superblocks() {
        write_u32_slice(w, sb)?;
    }
    write_u32_slice(w, &occ.totals())?;

    // Pad the record to an 8-byte boundary (the u32 arrays may leave a 4-byte tail).
    let body = 64
        + 8 * bwt_data_words
        + 8 * bwt_amb_words
        + 8 * 5 * occ_bitvec_words
        + 4 * 5 * occ_superblock_len
        + 4 * 5;
    // Pad to a multiple of 8. `(body + 7) / 8 * 8`, not `div_ceil` (Rust 1.73+; MSRV 1.72).
    let pad = (body.div_ceil(8) * 8 - body) as usize;
    if pad != 0 {
        w.write_all(&[0u8; 8][..pad])?;
    }
    Ok(())
}

/// Reads an existing index file.
#[derive(Debug)]
pub struct IndexReader;

impl IndexReader {
    /// Open and memory-map an existing index file, parsing the header, section
    /// table, and contig set, and validating every section's extent + 8-alignment.
    /// The FM query surface is obtained via [`ReferenceIndex::view`].
    pub fn open(path: impl AsRef<Path>) -> Result<ReferenceIndex, IndexIoError> {
        let path = path.as_ref().to_path_buf();
        let file = File::open(&path)?;
        let mmap = MmapReadOnly::map(&file)?;
        let bytes = mmap.as_bytes();

        let header = read_header(bytes)?;
        validate_header(&header)?;

        let version = IndexVersion::from_u16(header.version).ok_or_else(|| {
            IndexIoError::Invalid(format!("unsupported index version {}", header.version))
        })?;
        if version != IndexVersion::V1 {
            return Err(IndexIoError::Invalid(format!(
                "unsupported index version {version:?}"
            )));
        }

        let sections = read_section_table(bytes, &header)?;
        validate_sections(bytes, &sections)?;
        let contigs = read_contigs(bytes, &header, &sections)?;

        // Eagerly validate the FM-index sections (block directory offsets + declared
        // word-counts) so open() is the single rejection point for corrupt files.
        // The view is dropped immediately; queries use ReferenceIndex::view().
        crate::genomics::index::view::FmIndexView::new(bytes, &sections)?;

        Ok(ReferenceIndex {
            path,
            header,
            contigs,
            sections,
            mmap,
        })
    }
}

/// Validate that every section lies within the file and starts 8-aligned.
fn validate_sections(bytes: &[u8], sections: &[SectionEntry]) -> Result<(), IndexIoError> {
    for s in sections {
        if s.offset % 8 != 0 {
            return Err(IndexIoError::Invalid(format!(
                "section {:?} offset {} is not 8-aligned",
                s.kind, s.offset
            )));
        }
        let end = s
            .offset
            .checked_add(s.bytes)
            .ok_or_else(|| IndexIoError::Invalid("section extent overflow".to_string()))?;
        if end as usize > bytes.len() {
            return Err(IndexIoError::Invalid(format!(
                "section {:?} out of bounds",
                s.kind
            )));
        }
    }
    Ok(())
}

/// Parse the `Contigs` section into a `ContigSet`, recomputing global offsets via
/// `push` and asserting they match the stored offsets (an integrity check).
fn read_contigs(
    bytes: &[u8],
    header: &IndexHeader,
    sections: &[SectionEntry],
) -> Result<ContigSet, IndexIoError> {
    let section = sections
        .iter()
        .find(|s| s.kind == SectionKind::Contigs)
        .ok_or_else(|| IndexIoError::Invalid("missing contigs section".to_string()))?;
    let mut offset = section.offset as usize;
    // `validate_sections` already proved `offset + bytes <= file len`, so this
    // addition cannot overflow on the 64-bit targets Rosalind supports.
    let end = section.offset as usize + section.bytes as usize;

    let mut contigs = ContigSet::new();
    for _ in 0..header.contig_count {
        let name_len = read_u32(bytes, &mut offset)? as usize;
        if offset + name_len > end {
            return Err(IndexIoError::Invalid(
                "contig name out of bounds".to_string(),
            ));
        }
        let name = std::str::from_utf8(&bytes[offset..offset + name_len])
            .map_err(|_| IndexIoError::Invalid("contig name not valid utf-8".to_string()))?
            .to_string();
        offset += name_len;
        // The two u64 fields (length, global_offset) must lie within this section,
        // not spill into the next one (`read_u64` only bounds-checks against the
        // whole file). Reject a record that would cross the section boundary.
        if offset + 16 > end {
            return Err(IndexIoError::Invalid(
                "contig record crosses the section boundary".to_string(),
            ));
        }
        let length = read_u64(bytes, &mut offset)?;
        let stored_global = read_u64(bytes, &mut offset)?;

        let length_u32 = u32::try_from(length)
            .map_err(|_| IndexIoError::Invalid("contig length exceeds u32".to_string()))?;
        let id = contigs.push(name, length_u32);
        let recomputed = contigs.by_id(id).expect("just pushed").global_offset;
        if recomputed != stored_global {
            return Err(IndexIoError::Invalid(format!(
                "contig {id} global_offset {stored_global} != recomputed {recomputed}"
            )));
        }
    }
    Ok(contigs)
}

fn validate_header(header: &IndexHeader) -> Result<(), IndexIoError> {
    if header.magic != ROSALIND_INDEX_MAGIC {
        return Err(IndexIoError::Invalid("bad magic bytes".to_string()));
    }
    if header.endian != IndexHeader::ENDIAN_LITTLE {
        return Err(IndexIoError::Invalid(
            "index endian mismatch (expected little-endian)".to_string(),
        ));
    }
    if header.header_bytes as usize != IndexHeader::FIXED_SIZE {
        return Err(IndexIoError::Invalid(format!(
            "unexpected header size {}",
            header.header_bytes
        )));
    }
    if header.contig_count == 0 {
        return Err(IndexIoError::Invalid(
            "index must contain at least one contig".to_string(),
        ));
    }
    if header.section_table_offset == 0 {
        return Err(IndexIoError::Invalid(
            "missing section table offset".to_string(),
        ));
    }
    Ok(())
}

fn read_header(bytes: &[u8]) -> Result<IndexHeader, IndexIoError> {
    if bytes.len() < IndexHeader::FIXED_SIZE {
        return Err(IndexIoError::Invalid(
            "file too small for header".to_string(),
        ));
    }
    let mut offset = 0usize;
    let mut magic = [0u8; 8];
    magic.copy_from_slice(&bytes[offset..offset + 8]);
    offset += 8;

    let version = read_u16(bytes, &mut offset)?;
    let endian = bytes[offset];
    offset += 1;
    let reserved0 = bytes[offset];
    offset += 1;
    let flags = read_u32(bytes, &mut offset)?;
    let contig_count = read_u32(bytes, &mut offset)?;
    let sa_sample_rate = read_u32(bytes, &mut offset)?;
    let header_bytes = read_u64(bytes, &mut offset)?;
    let section_table_offset = read_u64(bytes, &mut offset)?;
    let section_table_bytes = read_u64(bytes, &mut offset)?;
    let mut reference_blake3 = [0u8; 32];
    reference_blake3.copy_from_slice(&bytes[offset..offset + 32]);

    Ok(IndexHeader {
        magic,
        version,
        endian,
        reserved0,
        flags,
        contig_count,
        sa_sample_rate,
        header_bytes,
        section_table_offset,
        section_table_bytes,
        reference_blake3,
    })
}

fn write_header(mut w: impl Write, header: &IndexHeader) -> Result<(), IndexIoError> {
    w.write_all(&header.magic)?;
    w.write_all(&header.version.to_le_bytes())?;
    w.write_all(&[header.endian])?;
    w.write_all(&[header.reserved0])?;
    w.write_all(&header.flags.to_le_bytes())?;
    w.write_all(&header.contig_count.to_le_bytes())?;
    w.write_all(&header.sa_sample_rate.to_le_bytes())?;
    w.write_all(&header.header_bytes.to_le_bytes())?;
    w.write_all(&header.section_table_offset.to_le_bytes())?;
    w.write_all(&header.section_table_bytes.to_le_bytes())?;
    w.write_all(&header.reference_blake3)?;
    Ok(())
}

fn write_section_table(mut w: impl Write, sections: &[SectionEntry]) -> Result<(), IndexIoError> {
    let count: u32 = sections
        .len()
        .try_into()
        .map_err(|_| IndexIoError::Invalid("too many sections".to_string()))?;
    w.write_all(&count.to_le_bytes())?;
    for section in sections {
        let kind_u32 = section.kind as u32;
        w.write_all(&kind_u32.to_le_bytes())?;
        w.write_all(&section.offset.to_le_bytes())?;
        w.write_all(&section.bytes.to_le_bytes())?;
    }
    Ok(())
}

fn read_section_table(
    bytes: &[u8],
    header: &IndexHeader,
) -> Result<Vec<SectionEntry>, IndexIoError> {
    let mut offset = header.section_table_offset as usize;
    let end = offset
        .checked_add(header.section_table_bytes as usize)
        .ok_or_else(|| IndexIoError::Invalid("section table overflow".to_string()))?;
    if end > bytes.len() {
        return Err(IndexIoError::Invalid(
            "section table out of bounds".to_string(),
        ));
    }
    let count = read_u32(bytes, &mut offset)? as usize;
    let mut sections = Vec::with_capacity(count);
    for _ in 0..count {
        let kind_raw = read_u32(bytes, &mut offset)?;
        let kind = SectionKind::from_u32(kind_raw)
            .ok_or_else(|| IndexIoError::Invalid(format!("unknown section kind {}", kind_raw)))?;
        let sec_offset = read_u64(bytes, &mut offset)?;
        let sec_bytes = read_u64(bytes, &mut offset)?;
        sections.push(SectionEntry {
            kind,
            offset: sec_offset,
            bytes: sec_bytes,
        });
    }
    Ok(sections)
}

fn read_u16(bytes: &[u8], offset: &mut usize) -> Result<u16, IndexIoError> {
    if *offset + 2 > bytes.len() {
        return Err(IndexIoError::Invalid("unexpected EOF".to_string()));
    }
    let mut buf = [0u8; 2];
    buf.copy_from_slice(&bytes[*offset..*offset + 2]);
    *offset += 2;
    Ok(u16::from_le_bytes(buf))
}

fn read_u32(bytes: &[u8], offset: &mut usize) -> Result<u32, IndexIoError> {
    if *offset + 4 > bytes.len() {
        return Err(IndexIoError::Invalid("unexpected EOF".to_string()));
    }
    let mut buf = [0u8; 4];
    buf.copy_from_slice(&bytes[*offset..*offset + 4]);
    *offset += 4;
    Ok(u32::from_le_bytes(buf))
}

fn read_u64(bytes: &[u8], offset: &mut usize) -> Result<u64, IndexIoError> {
    if *offset + 8 > bytes.len() {
        return Err(IndexIoError::Invalid("unexpected EOF".to_string()));
    }
    let mut buf = [0u8; 8];
    buf.copy_from_slice(&bytes[*offset..*offset + 8]);
    *offset += 8;
    Ok(u64::from_le_bytes(buf))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::genomics::index::format::SectionKind;
    use crate::genomics::{BaseCode, BlockedFMIndex, FmSymbol, GenomeIndex};
    use std::env;
    use std::time::{SystemTime, UNIX_EPOCH};

    fn temp_path(suffix: &str) -> PathBuf {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        env::temp_dir().join(format!("rosalind-b3b-{suffix}-{nanos}.idx"))
    }

    fn sample_index() -> GenomeIndex {
        GenomeIndex::from_named_sequences(&[
            ("chr1".to_string(), b"ACGTACGTNNACGTACG".to_vec()),
            ("chr2".to_string(), b"TTTTGGGGCCCCAAAANNNN".to_vec()),
        ])
        .expect("build")
    }

    #[test]
    fn serialized_index_is_byte_identical_across_builds() {
        let idx = sample_index();
        let p1 = temp_path("det1");
        let p2 = temp_path("det2");
        IndexWriter::create(&p1)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();
        IndexWriter::create(&p2)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();
        let a = std::fs::read(&p1).unwrap();
        let b = std::fs::read(&p2).unwrap();
        assert_eq!(
            a, b,
            "two serializations of the same genome must be byte-identical"
        );
        let _ = std::fs::remove_file(p1);
        let _ = std::fs::remove_file(p2);
    }

    #[test]
    fn open_parses_header_contigs_and_sections() {
        let idx = sample_index();
        let path = temp_path("open");
        IndexWriter::create(&path)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();

        let loaded = IndexReader::open(&path).expect("open");
        assert_eq!(loaded.header.magic, ROSALIND_INDEX_MAGIC);

        // Contigs round-trip with names, lengths, and global offsets.
        let contigs = loaded.contigs();
        assert_eq!(contigs.len(), 2);
        assert_eq!(contigs.by_name("chr1").unwrap().length, 17);
        assert_eq!(contigs.by_name("chr2").unwrap().global_offset, 17);

        // reference_blake3 matches the uppercased ASCII reference.
        let mut hasher = blake3::Hasher::new();
        hasher.update(idx.reference());
        assert_eq!(
            *hasher.finalize().as_bytes(),
            loaded.header.reference_blake3
        );

        // The FM sections are all present.
        for kind in [
            SectionKind::FmMeta,
            SectionKind::Boundaries,
            SectionKind::Blocks,
            SectionKind::SaSamples,
        ] {
            assert!(
                loaded.sections.iter().any(|s| s.kind == kind),
                "missing {kind:?}"
            );
        }
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn open_rejects_a_truncated_index() {
        let idx = sample_index();
        let path = temp_path("trunc");
        IndexWriter::create(&path)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();
        let mut bytes = std::fs::read(&path).unwrap();
        bytes.truncate(bytes.len() / 2);
        let truncated = temp_path("trunc-half");
        std::fs::write(&truncated, &bytes).unwrap();
        assert!(IndexReader::open(&truncated).is_err());
        let _ = std::fs::remove_file(path);
        let _ = std::fs::remove_file(truncated);
    }

    #[test]
    fn open_rejects_bad_magic() {
        let idx = sample_index();
        let path = temp_path("magic");
        IndexWriter::create(&path)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();
        let mut bytes = std::fs::read(&path).unwrap();
        bytes[0] = b'X';
        let corrupt = temp_path("magic-bad");
        std::fs::write(&corrupt, &bytes).unwrap();
        assert!(IndexReader::open(&corrupt).is_err());
        let _ = std::fs::remove_file(path);
        let _ = std::fs::remove_file(corrupt);
    }

    #[test]
    fn serialized_sections_are_present_aligned_and_in_bounds() {
        let idx = sample_index();
        let path = temp_path("sections");
        IndexWriter::create(&path)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();
        let bytes = std::fs::read(&path).unwrap();

        let header = read_header(&bytes).unwrap();
        validate_header(&header).unwrap();
        let sections = read_section_table(&bytes, &header).unwrap();

        for kind in [
            SectionKind::Contigs,
            SectionKind::Reference2bit,
            SectionKind::FmMeta,
            SectionKind::Boundaries,
            SectionKind::Blocks,
            SectionKind::SaSamples,
        ] {
            let s = sections
                .iter()
                .find(|s| s.kind == kind)
                .unwrap_or_else(|| panic!("missing section {kind:?}"));
            assert_eq!(s.offset % 8, 0, "{kind:?} not 8-aligned");
            let end = s.offset + s.bytes;
            assert!(end as usize <= bytes.len(), "{kind:?} out of bounds");
        }
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn open_rejects_a_corrupt_block_directory() {
        // A file whose section extents are valid but whose first block-record
        // offset is corrupt must be rejected at open() — never silently mis-read.
        let idx = sample_index();
        let path = temp_path("corrupt-blockdir");
        IndexWriter::create(&path)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();
        let mut bytes = std::fs::read(&path).unwrap();

        // Locate the Blocks section via the parsed section table, then overwrite
        // the first directory entry (u64 at the section start) with a huge offset.
        let header = read_header(&bytes).unwrap();
        let sections = read_section_table(&bytes, &header).unwrap();
        let blocks = sections
            .iter()
            .find(|s| s.kind == SectionKind::Blocks)
            .expect("blocks section");
        let dir0 = blocks.offset as usize; // first u64 of the directory
        bytes[dir0..dir0 + 8].copy_from_slice(&u64::MAX.to_le_bytes());

        let corrupt = temp_path("corrupt-blockdir-bad");
        std::fs::write(&corrupt, &bytes).unwrap();
        assert!(
            IndexReader::open(&corrupt).is_err(),
            "open must reject a block directory whose record offset is out of bounds"
        );
        let _ = std::fs::remove_file(path);
        let _ = std::fs::remove_file(corrupt);
    }

    #[test]
    fn reference_view_decodes_bytes_identical_to_in_ram() {
        let idx = sample_index();
        let path = temp_path("refview");
        IndexWriter::create(&path)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();
        let loaded = IndexReader::open(&path).unwrap();
        let rv = loaded.reference_view().unwrap();
        let reference = idx.reference();

        assert_eq!(rv.len(), reference.len());
        assert!(!rv.is_empty());
        for (i, &expected) in reference.iter().enumerate() {
            assert_eq!(rv.base_at(i), expected, "base_at mismatch @ {i}");
        }

        let mut buf = Vec::new();
        let n = reference.len();
        for (s, e) in [(0usize, 10usize), (6, 14), (12, 22), (0, n), (n - 3, n + 5)] {
            rv.decode_window(s, e, &mut buf);
            assert_eq!(
                buf.as_slice(),
                &reference[s..e.min(n)],
                "decode_window {s}..{e}"
            );
            // The transient-free Arc decoder must be byte-identical to decode_window.
            let arc = rv.decode_window_arc(s, e);
            assert_eq!(&*arc, buf.as_slice(), "decode_window_arc {s}..{e}");
        }
        let _ = std::fs::remove_file(path);
    }

    #[test]
    fn reference_view_is_a_small_borrow() {
        assert!(
            std::mem::size_of::<crate::genomics::ReferenceView<'_>>() <= 64,
            "ReferenceView must be a small borrow, got {}",
            std::mem::size_of::<crate::genomics::ReferenceView<'_>>()
        );
    }

    #[test]
    fn open_rejects_a_misaligned_block_offset() {
        // A block-record offset that is in-bounds but not 8-aligned must be rejected
        // at open() — never deferred to a query-time `.expect("aligned")` panic.
        let idx = sample_index();
        let path = temp_path("misaligned-blockdir");
        IndexWriter::create(&path)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();
        let mut bytes = std::fs::read(&path).unwrap();

        let header = read_header(&bytes).unwrap();
        let sections = read_section_table(&bytes, &header).unwrap();
        let blocks = sections
            .iter()
            .find(|s| s.kind == SectionKind::Blocks)
            .expect("blocks section");
        // Bump the first record offset (8-aligned by construction) by 1: still
        // in-bounds, now misaligned.
        let dir0 = blocks.offset as usize;
        let mut off = [0u8; 8];
        off.copy_from_slice(&bytes[dir0..dir0 + 8]);
        let bumped = u64::from_le_bytes(off) + 1;
        bytes[dir0..dir0 + 8].copy_from_slice(&bumped.to_le_bytes());

        let corrupt = temp_path("misaligned-blockdir-bad");
        std::fs::write(&corrupt, &bytes).unwrap();
        assert!(
            IndexReader::open(&corrupt).is_err(),
            "open must reject a non-8-aligned block record offset"
        );
        let _ = std::fs::remove_file(path);
        let _ = std::fs::remove_file(corrupt);
    }

    #[test]
    fn view_is_byte_identical_to_in_ram_index() {
        let idx = sample_index();
        let path = temp_path("equiv");
        IndexWriter::create(&path)
            .unwrap()
            .write_genome_index(&idx)
            .unwrap();
        let loaded = IndexReader::open(&path).unwrap();
        let gv = loaded.genome_view().unwrap();
        let fv = gv.fm();
        let fm = idx.fm();

        // backward_search + sa_at over a pattern battery, incl. boundary-straddle
        // and an N-bearing pattern.
        let patterns: &[&[u8]] = &[
            b"A", b"C", b"G", b"T", b"N", b"AC", b"ACG", b"ACGT", b"GT", b"TTTT", b"GGGG", b"CCCC",
            b"AAAA", b"ACGTACG", b"NNAC", b"GTTTTT", b"NNNN", b"TACG", b"CGTACG", b"GTACGT",
        ];
        for &p in patterns {
            let a = fm.backward_search(p);
            let b = fv.backward_search(p);
            assert_eq!(a.lower, b.lower, "lower mismatch for {p:?}");
            assert_eq!(a.upper, b.upper, "upper mismatch for {p:?}");
            // sa_at over the whole interval must agree.
            for i in (a.lower as usize)..(a.upper as usize) {
                assert_eq!(fm.sa_at(i), fv.sa_at(i), "sa_at mismatch @ {i} for {p:?}");
            }
            // locate_exact returns identical Locus vectors.
            assert_eq!(
                idx.locate_exact(p, 1024),
                gv.locate_exact(p, 1024),
                "locate_exact mismatch for {p:?}"
            );
        }

        // Exhaustive rank/symbol_at equivalence over every BWT position.
        use crate::genomics::fm_backing::{self, BwtBacking};
        let _ = <BlockedFMIndex as BwtBacking>::bwt_len;
        for symbol in [
            FmSymbol::Sentinel,
            FmSymbol::Base(BaseCode::A),
            FmSymbol::Base(BaseCode::C),
            FmSymbol::Base(BaseCode::G),
            FmSymbol::Base(BaseCode::T),
            FmSymbol::Base(BaseCode::N),
        ] {
            for pos in 0..=fm.len() {
                assert_eq!(
                    fm.rank(symbol, pos),
                    fm_backing::rank(fv, symbol, pos),
                    "rank mismatch {symbol:?} @ {pos}"
                );
            }
        }
        for i in 0..fm.len() {
            assert_eq!(
                fm.symbol_at(i),
                fm_backing::symbol_at(fv, i),
                "symbol_at @ {i}"
            );
        }
        let _ = std::fs::remove_file(path);
    }
}