Skip to main content

rosalind/genomics/index/
view.rs

1//! Borrowed, zero-copy query view over a memory-mapped index.
2//!
3//! [`FmIndexView`] reads the FM-index sections directly from the mmap as
4//! `&[u64]`/`&[u32]` (via a checked `slice::align_to`) and implements
5//! [`crate::genomics::fm_backing::BwtBacking`]. Because the FM query algorithm is
6//! generic over that trait, the view runs the *same* code as the in-RAM
7//! [`crate::genomics::BlockedFMIndex`] — it can differ only in how it fetches
8//! bytes, never in logic. The zero-copy reinterpretation is value-correct only on
9//! little-endian hosts (the file is little-endian; [`FmIndexView::new`] rejects
10//! big-endian hosts).
11
12use crate::core::{ContigSet, Locus};
13use crate::genomics::fm_backing::{self, BwtBacking};
14use crate::genomics::index::format::{SectionEntry, SectionKind};
15use crate::genomics::index::io::IndexIoError;
16use crate::genomics::{popcount_range, BaseCode, FMInterval, FmSymbol, RANK_STRIDE};
17
18/// A borrowed FM-index over memory-mapped bytes.
19#[derive(Debug)]
20pub struct FmIndexView<'a> {
21    bwt_len: usize,
22    block_size: usize,
23    num_blocks: usize,
24    sentinel_pos: usize,
25    sample_rate: usize,
26    c_table: [u32; 6],
27    /// `6 * (num_blocks + 1)` u32: `[c0..c4, sentinel]` per boundary entry.
28    boundaries: &'a [u32],
29    /// `num_blocks` u64 record offsets, relative to the start of `blocks`.
30    block_dir: &'a [u64],
31    /// The whole `Blocks` section payload (records are sliced on demand).
32    blocks: &'a [u8],
33    sampled: SampledView<'a>,
34}
35
36#[derive(Debug)]
37struct SampledView<'a> {
38    bwt_len: usize,
39    marks: &'a [u64],
40    superblocks: &'a [u32],
41    values: &'a [u32],
42}
43
44/// A parsed block record (computed on demand from the directory; holds only
45/// borrowed slices into the mmap).
46struct BlockView<'a> {
47    start: usize,
48    end: usize,
49    sentinel_offset: Option<usize>,
50    stride: usize,
51    bwt_data: &'a [u64],
52    bwt_amb: &'a [u64],
53    occ_bitvecs: [&'a [u64]; 5],
54    occ_superblocks: [&'a [u32]; 5],
55}
56
57impl<'a> FmIndexView<'a> {
58    /// Build a view from the mmap `bytes` and the parsed `sections`. Validates
59    /// section presence, lengths, and the little-endian host requirement.
60    pub(crate) fn new(bytes: &'a [u8], sections: &[SectionEntry]) -> Result<Self, IndexIoError> {
61        if cfg!(target_endian = "big") {
62            return Err(IndexIoError::Invalid(
63                "zero-copy index view requires a little-endian host".to_string(),
64            ));
65        }
66
67        let fm_meta = section_bytes(bytes, sections, SectionKind::FmMeta)?;
68        let mut o = 0usize;
69        let block_size = read_u64(fm_meta, &mut o)? as usize;
70        let bwt_len = read_u64(fm_meta, &mut o)? as usize;
71        let sentinel_pos = read_u64(fm_meta, &mut o)? as usize;
72        let sample_rate = read_u64(fm_meta, &mut o)? as usize;
73        let num_blocks = read_u64(fm_meta, &mut o)? as usize;
74        let mut c_table = [0u32; 6];
75        for slot in &mut c_table {
76            *slot = read_u32(fm_meta, &mut o)?;
77        }
78
79        let boundaries_bytes = section_bytes(bytes, sections, SectionKind::Boundaries)?;
80        let boundaries = as_u32_slice(boundaries_bytes)?;
81        if boundaries.len() != 6 * (num_blocks + 1) {
82            return Err(IndexIoError::Invalid(
83                "boundaries length mismatch".to_string(),
84            ));
85        }
86
87        let blocks = section_bytes(bytes, sections, SectionKind::Blocks)?;
88        let dir_bytes = num_blocks
89            .checked_mul(8)
90            .ok_or_else(|| IndexIoError::Invalid("block directory overflow".to_string()))?;
91        if dir_bytes > blocks.len() {
92            return Err(IndexIoError::Invalid(
93                "block directory out of bounds".to_string(),
94            ));
95        }
96        let block_dir = as_u64_slice(&blocks[..dir_bytes])?;
97        validate_block_records(blocks, block_dir)?;
98
99        // SaSamples.
100        let sa = section_bytes(bytes, sections, SectionKind::SaSamples)?;
101        let mut so = 0usize;
102        let rate = read_u64(sa, &mut so)? as usize;
103        if rate != sample_rate {
104            return Err(IndexIoError::Invalid(format!(
105                "SaSamples rate {rate} != FmMeta sa_sample_rate {sample_rate}"
106            )));
107        }
108        let s_bwt_len = read_u64(sa, &mut so)? as usize;
109        let marks_words = read_u64(sa, &mut so)? as usize;
110        let sb_len = read_u64(sa, &mut so)? as usize;
111        let values_len = read_u64(sa, &mut so)? as usize;
112        let marks = as_u64_slice(slice_exact(sa, &mut so, marks_words * 8)?)?;
113        let superblocks = as_u32_slice(slice_exact(sa, &mut so, sb_len * 4)?)?;
114        let values = as_u32_slice(slice_exact(sa, &mut so, values_len * 4)?)?;
115
116        Ok(Self {
117            bwt_len,
118            block_size,
119            num_blocks,
120            sentinel_pos,
121            sample_rate,
122            c_table,
123            boundaries,
124            block_dir,
125            blocks,
126            sampled: SampledView {
127                bwt_len: s_bwt_len,
128                marks,
129                superblocks,
130                values,
131            },
132        })
133    }
134
135    /// Exact FM-index backward search (delegates to the shared generic op).
136    pub fn backward_search(&self, pattern: &[u8]) -> FMInterval {
137        fm_backing::backward_search(self, pattern)
138    }
139
140    /// Suffix array value at BWT index `index` (delegates to the shared generic op).
141    pub fn sa_at(&self, index: usize) -> usize {
142        fm_backing::sa_at(self, index)
143    }
144
145    /// Locate up to `max_hits` 0-based reference positions for `interval`.
146    pub fn locate_interval(&self, interval: FMInterval, max_hits: usize) -> Vec<u32> {
147        fm_backing::locate_interval(self, interval, max_hits)
148    }
149
150    /// Parse block record `block_idx` from the directory (borrowed slices only).
151    fn block(&self, block_idx: usize) -> BlockView<'a> {
152        let rec = self.block_dir[block_idx] as usize;
153        let s = &self.blocks[rec..];
154        let mut o = 0usize;
155        let start = read_u64_infallible(s, &mut o) as usize;
156        let end = read_u64_infallible(s, &mut o) as usize;
157        let sentinel_raw = read_i64_infallible(s, &mut o);
158        let stride = read_u64_infallible(s, &mut o) as usize;
159        let bwt_data_words = read_u64_infallible(s, &mut o) as usize;
160        let bwt_amb_words = read_u64_infallible(s, &mut o) as usize;
161        let occ_bitvec_words = read_u64_infallible(s, &mut o) as usize;
162        let occ_superblock_len = read_u64_infallible(s, &mut o) as usize;
163
164        let bwt_data = as_u64_slice(&s[o..o + bwt_data_words * 8]).expect("aligned");
165        o += bwt_data_words * 8;
166        let bwt_amb = as_u64_slice(&s[o..o + bwt_amb_words * 8]).expect("aligned");
167        o += bwt_amb_words * 8;
168
169        let mut occ_bitvecs: [&[u64]; 5] = [&[]; 5];
170        for slot in &mut occ_bitvecs {
171            *slot = as_u64_slice(&s[o..o + occ_bitvec_words * 8]).expect("aligned");
172            o += occ_bitvec_words * 8;
173        }
174        let mut occ_superblocks: [&[u32]; 5] = [&[]; 5];
175        for slot in &mut occ_superblocks {
176            *slot = as_u32_slice(&s[o..o + occ_superblock_len * 4]).expect("aligned");
177            o += occ_superblock_len * 4;
178        }
179
180        BlockView {
181            start,
182            end,
183            sentinel_offset: if sentinel_raw < 0 {
184                None
185            } else {
186                Some(sentinel_raw as usize)
187            },
188            stride,
189            bwt_data,
190            bwt_amb,
191            occ_bitvecs,
192            occ_superblocks,
193        }
194    }
195}
196
197impl BwtBacking for FmIndexView<'_> {
198    fn bwt_len(&self) -> usize {
199        self.bwt_len
200    }
201    fn block_size(&self) -> usize {
202        self.block_size
203    }
204    fn num_blocks(&self) -> usize {
205        self.num_blocks
206    }
207    fn sentinel_pos(&self) -> usize {
208        self.sentinel_pos
209    }
210    fn sample_rate(&self) -> usize {
211        self.sample_rate
212    }
213    fn c_table(&self) -> [u32; 6] {
214        self.c_table
215    }
216    fn boundary_base(&self, block_idx: usize, base_index: usize) -> u32 {
217        self.boundaries[block_idx * 6 + base_index]
218    }
219    fn boundary_sentinel(&self, block_idx: usize) -> u32 {
220        self.boundaries[block_idx * 6 + 5]
221    }
222    fn block_rank(&self, block_idx: usize, symbol: FmSymbol, within: usize) -> u32 {
223        let block = self.block(block_idx);
224        let n = block.end - block.start;
225        let bounded = within.min(n);
226        match symbol {
227            FmSymbol::Sentinel => match block.sentinel_offset {
228                Some(off) if off < bounded => 1,
229                _ => 0,
230            },
231            FmSymbol::Base(code) => {
232                // Mirror RankSelectIndex::rank over the borrowed occ slices.
233                let bi = code.index();
234                let sb = bounded / block.stride;
235                let within_start = sb * block.stride;
236                let mut count = block.occ_superblocks[bi][sb]
237                    + popcount_range(block.occ_bitvecs[bi], within_start, bounded);
238                // Mirror BWTBlock::rank_symbol's N correction: the sentinel is
239                // stored as `N` in the block BWT but is not a real `N`.
240                if code == BaseCode::N {
241                    if let Some(off) = block.sentinel_offset {
242                        if off < bounded {
243                            count = count.saturating_sub(1);
244                        }
245                    }
246                }
247                count
248            }
249        }
250    }
251    fn block_symbol(&self, block_idx: usize, within: usize) -> FmSymbol {
252        let block = self.block(block_idx);
253        // Mirror CompressedDNA::base_at: ambiguity (1 bit/base) marks `N`, else
254        // a 2-bit code (32 bases per u64 word).
255        let amb_word = block.bwt_amb[within / 64];
256        if amb_word & (1u64 << (within % 64)) != 0 {
257            return FmSymbol::Base(BaseCode::N);
258        }
259        let data_word = block.bwt_data[within / 32];
260        let code = ((data_word >> ((within % 32) * 2)) & 0b11) as u8;
261        let base = match code {
262            0 => BaseCode::A,
263            1 => BaseCode::C,
264            2 => BaseCode::G,
265            _ => BaseCode::T,
266        };
267        FmSymbol::Base(base)
268    }
269    fn sampled_at(&self, index: usize) -> Option<u32> {
270        debug_assert!(index < self.sampled.bwt_len);
271        let word = self.sampled.marks[index / 64];
272        if word & (1u64 << (index % 64)) == 0 {
273            return None;
274        }
275        let sb = index / RANK_STRIDE;
276        let rank = self.sampled.superblocks[sb]
277            + popcount_range(self.sampled.marks, sb * RANK_STRIDE, index);
278        Some(self.sampled.values[rank as usize])
279    }
280}
281
282/// An FM-index view paired with the contig set: exact-match queries to `Locus`es.
283#[derive(Debug)]
284pub struct GenomeIndexView<'a> {
285    fm: FmIndexView<'a>,
286    contigs: &'a ContigSet,
287}
288
289impl<'a> GenomeIndexView<'a> {
290    pub(crate) fn new(fm: FmIndexView<'a>, contigs: &'a ContigSet) -> Self {
291        Self { fm, contigs }
292    }
293
294    /// The borrowed FM-index.
295    pub fn fm(&self) -> &FmIndexView<'a> {
296        &self.fm
297    }
298
299    /// The contig set.
300    pub fn contigs(&self) -> &ContigSet {
301        self.contigs
302    }
303
304    /// Locate exact occurrences of `pattern`, returning up to `max_hits` `Locus`es
305    /// with boundary-straddling hits removed, sorted by `(contig, pos)` — the same
306    /// surface (and result) as `GenomeIndex::locate_exact`.
307    pub fn locate_exact(&self, pattern: &[u8], max_hits: usize) -> Vec<Locus> {
308        if pattern.is_empty() {
309            return Vec::new();
310        }
311        let pattern: Vec<u8> = pattern.iter().map(|b| b.to_ascii_uppercase()).collect();
312        let interval = self.fm.backward_search(&pattern);
313        let len = pattern.len() as u32;
314        let mut loci: Vec<Locus> = self
315            .fm
316            .locate_interval(interval, max_hits)
317            .into_iter()
318            .filter_map(|g| self.contigs.resolve_span(g as u64, len))
319            .collect();
320        loci.sort_unstable();
321        loci
322    }
323}
324
325/// A borrowed, zero-copy view over the persisted 2-bit forward reference
326/// (`Reference2bit`). Decodes bases on demand from the mmap — no full-reference
327/// allocation — so consumers (the aligner DP window in B4b, variants' `ref_base`
328/// in B4c) read the reference from the `.idx` alone. Uses **global** (concatenated)
329/// coordinates; `(contig, pos)` mapping stays with `ContigSet`. The little-endian
330/// host requirement is the same as `FmIndexView` (checked in `new`).
331#[derive(Debug)]
332pub struct ReferenceView<'a> {
333    len: usize,
334    /// 2-bit packed bases, 32 per `u64` word (A/C/G/T = 0/1/2/3).
335    data: &'a [u64],
336    /// Ambiguity bits, one per base (a set bit marks `N`).
337    amb: &'a [u64],
338}
339
340impl<'a> ReferenceView<'a> {
341    /// Parse + validate the `Reference2bit` section into a borrowed view.
342    pub(crate) fn new(bytes: &'a [u8], sections: &[SectionEntry]) -> Result<Self, IndexIoError> {
343        if cfg!(target_endian = "big") {
344            return Err(IndexIoError::Invalid(
345                "zero-copy reference view requires a little-endian host".to_string(),
346            ));
347        }
348        let section = section_bytes(bytes, sections, SectionKind::Reference2bit)?;
349        let mut o = 0usize;
350        let len = read_u64(section, &mut o)? as usize;
351        let data_words = read_u64(section, &mut o)? as usize;
352        let amb_words = read_u64(section, &mut o)? as usize;
353        let data = as_u64_slice(slice_exact(section, &mut o, data_words.saturating_mul(8))?)?;
354        let amb = as_u64_slice(slice_exact(section, &mut o, amb_words.saturating_mul(8))?)?;
355        // The slices must cover `len` bases so `base_at` cannot index out of bounds.
356        if data.len().saturating_mul(32) < len || amb.len().saturating_mul(64) < len {
357            return Err(IndexIoError::Invalid(
358                "Reference2bit section too small for the declared length".to_string(),
359            ));
360        }
361        Ok(Self { len, data, amb })
362    }
363
364    /// Number of reference bases.
365    pub fn len(&self) -> usize {
366        self.len
367    }
368
369    /// Whether the reference is empty.
370    pub fn is_empty(&self) -> bool {
371        self.len == 0
372    }
373
374    /// The ASCII base at global position `global`. Mirrors `CompressedDNA::base_at`:
375    /// a set ambiguity bit decodes to `N`, otherwise the 2-bit code maps to A/C/G/T.
376    pub fn base_at(&self, global: usize) -> u8 {
377        debug_assert!(global < self.len, "reference index out of range");
378        if self.amb[global / 64] & (1u64 << (global % 64)) != 0 {
379            return b'N';
380        }
381        let code = ((self.data[global / 32] >> ((global % 32) * 2)) & 0b11) as u8;
382        match code {
383            0 => b'A',
384            1 => b'C',
385            2 => b'G',
386            _ => b'T',
387        }
388    }
389
390    /// Decode `[start, end.min(len))` into `out` (cleared first). `end` is clamped
391    /// to `len`, so an over-range request never panics; `start <= end` is the
392    /// caller's contract. Bounded by the window size — never the whole genome.
393    pub fn decode_window(&self, start: usize, end: usize, out: &mut Vec<u8>) {
394        out.clear();
395        let end = end.min(self.len);
396        for i in start..end {
397            out.push(self.base_at(i));
398        }
399    }
400
401    /// Decode `[start, end.min(len))` directly into an `Arc<[u8]>`, with no
402    /// separate owned `Vec` materialized first. `Arc::from(Vec<u8>)` REALLOCATES
403    /// — source and Arc buffers are both fully resident during the copy, so a
404    /// per-contig decode briefly holds two copies of the contig (the reference-
405    /// decode transient that broke the predicted-peak bound). Collecting a
406    /// `TrustedLen` range straight into the `Arc` writes into its allocation in
407    /// place, so peak resident reference memory is ONE copy. Byte-identical to
408    /// `decode_window`; bounded by the window size.
409    pub fn decode_window_arc(&self, start: usize, end: usize) -> std::sync::Arc<[u8]> {
410        let end = end.min(self.len);
411        (start..end).map(|i| self.base_at(i)).collect()
412    }
413}
414
415/// Validate that every block record (at the directory's offsets) lies fully
416/// within the `Blocks` section, using checked arithmetic so a corrupt word-count
417/// can never overflow into an in-bounds-but-wrong slice. Lets `block()` use
418/// infallible slicing on a file that passed `open()`.
419fn validate_block_records(blocks: &[u8], block_dir: &[u64]) -> Result<(), IndexIoError> {
420    const HEADER: usize = 64; // 8 u64/i64 fields
421    for &rec_off in block_dir {
422        let rec = rec_off as usize;
423        // The record's u64 arrays must be 8-aligned for `block()`'s zero-copy
424        // `align_to` to succeed. Records are 8-aligned by construction; reject a
425        // corrupt non-aligned offset here at `open` rather than letting `block()`
426        // hit `.expect("aligned")` at query time (the integrity contract: a corrupt
427        // index is rejected up front, never crashed-on later).
428        // `% != 0` (not `usize::is_multiple_of`, stable only in 1.87) to hold MSRV 1.83.
429        #[allow(clippy::manual_is_multiple_of)]
430        if rec % 8 != 0 {
431            return Err(IndexIoError::Invalid(
432                "block record offset is not 8-aligned".to_string(),
433            ));
434        }
435        let header_end = rec
436            .checked_add(HEADER)
437            .filter(|&e| e <= blocks.len())
438            .ok_or_else(|| {
439                IndexIoError::Invalid("block record header out of bounds".to_string())
440            })?;
441        // Re-read the four word-count fields (header layout: start,end,sentinel,
442        // stride at [rec..rec+32]; then the 4 counts at [rec+32..rec+64]).
443        let mut o = rec + 32;
444        let bwt_data_words = read_u64(blocks, &mut o)? as usize;
445        let bwt_amb_words = read_u64(blocks, &mut o)? as usize;
446        let occ_bitvec_words = read_u64(blocks, &mut o)? as usize;
447        let occ_superblock_len = read_u64(blocks, &mut o)? as usize;
448        debug_assert_eq!(o, header_end);
449        // Bytes block() touches after the header: bwt_data + bwt_amb + 5×bitvec
450        // (u64) + 5×superblock (u32). (totals/pad are not read by block().)
451        let payload = bwt_data_words
452            .checked_mul(8)
453            .and_then(|x| bwt_amb_words.checked_mul(8).and_then(|y| x.checked_add(y)))
454            .and_then(|x| {
455                occ_bitvec_words
456                    .checked_mul(8)
457                    .and_then(|y| y.checked_mul(5))
458                    .and_then(|y| x.checked_add(y))
459            })
460            .and_then(|x| {
461                occ_superblock_len
462                    .checked_mul(4)
463                    .and_then(|y| y.checked_mul(5))
464                    .and_then(|y| x.checked_add(y))
465            })
466            .ok_or_else(|| IndexIoError::Invalid("block record size overflow".to_string()))?;
467        let rec_end = header_end
468            .checked_add(payload)
469            .filter(|&e| e <= blocks.len())
470            .ok_or_else(|| {
471                IndexIoError::Invalid("block record payload out of bounds".to_string())
472            })?;
473        let _ = rec_end;
474    }
475    Ok(())
476}
477
478/// The bytes of section `kind`.
479fn section_bytes<'a>(
480    bytes: &'a [u8],
481    sections: &[SectionEntry],
482    kind: SectionKind,
483) -> Result<&'a [u8], IndexIoError> {
484    let s = sections
485        .iter()
486        .find(|s| s.kind == kind)
487        .ok_or_else(|| IndexIoError::Invalid(format!("missing section {kind:?}")))?;
488    let start = s.offset as usize;
489    let end = start + s.bytes as usize;
490    bytes
491        .get(start..end)
492        .ok_or_else(|| IndexIoError::Invalid(format!("section {kind:?} out of bounds")))
493}
494
495/// Reinterpret a length-8k byte slice (8-aligned start) as `&[u64]`.
496fn as_u64_slice(bytes: &[u8]) -> Result<&[u64], IndexIoError> {
497    // SAFETY: any byte pattern is a valid `u64`; we require an empty prefix
498    // (start is 8-aligned) and empty suffix (length is a multiple of 8). Values
499    // are correct on little-endian hosts (checked in `FmIndexView::new`).
500    let (prefix, mid, suffix) = unsafe { bytes.align_to::<u64>() };
501    if !prefix.is_empty() || !suffix.is_empty() {
502        return Err(IndexIoError::Invalid("misaligned u64 array".to_string()));
503    }
504    Ok(mid)
505}
506
507/// Reinterpret a length-4k byte slice (4-aligned start) as `&[u32]`.
508fn as_u32_slice(bytes: &[u8]) -> Result<&[u32], IndexIoError> {
509    // SAFETY: as `as_u64_slice`, for `u32` (4-aligned, length a multiple of 4).
510    let (prefix, mid, suffix) = unsafe { bytes.align_to::<u32>() };
511    if !prefix.is_empty() || !suffix.is_empty() {
512        return Err(IndexIoError::Invalid("misaligned u32 array".to_string()));
513    }
514    Ok(mid)
515}
516
517/// Take the next `len` bytes from `bytes` starting at `*offset`, advancing it.
518fn slice_exact<'a>(
519    bytes: &'a [u8],
520    offset: &mut usize,
521    len: usize,
522) -> Result<&'a [u8], IndexIoError> {
523    let end = offset
524        .checked_add(len)
525        .ok_or_else(|| IndexIoError::Invalid("slice overflow".to_string()))?;
526    let out = bytes
527        .get(*offset..end)
528        .ok_or_else(|| IndexIoError::Invalid("slice out of bounds".to_string()))?;
529    *offset = end;
530    Ok(out)
531}
532
533fn read_u32(bytes: &[u8], offset: &mut usize) -> Result<u32, IndexIoError> {
534    let end = *offset + 4;
535    let buf = bytes
536        .get(*offset..end)
537        .ok_or_else(|| IndexIoError::Invalid("unexpected EOF".to_string()))?;
538    *offset = end;
539    Ok(u32::from_le_bytes(buf.try_into().unwrap()))
540}
541
542fn read_u64(bytes: &[u8], offset: &mut usize) -> Result<u64, IndexIoError> {
543    let end = *offset + 8;
544    let buf = bytes
545        .get(*offset..end)
546        .ok_or_else(|| IndexIoError::Invalid("unexpected EOF".to_string()))?;
547    *offset = end;
548    Ok(u64::from_le_bytes(buf.try_into().unwrap()))
549}
550
551fn read_u64_infallible(bytes: &[u8], offset: &mut usize) -> u64 {
552    let v = u64::from_le_bytes(bytes[*offset..*offset + 8].try_into().unwrap());
553    *offset += 8;
554    v
555}
556
557fn read_i64_infallible(bytes: &[u8], offset: &mut usize) -> i64 {
558    let v = i64::from_le_bytes(bytes[*offset..*offset + 8].try_into().unwrap());
559    *offset += 8;
560    v
561}