1use crate::core::{ContigSet, Locus};
13use crate::genomics::fm_backing::{self, BwtBacking};
14use crate::genomics::index::format::{SectionEntry, SectionKind};
15use crate::genomics::index::io::IndexIoError;
16use crate::genomics::{popcount_range, BaseCode, FMInterval, FmSymbol, RANK_STRIDE};
17
18#[derive(Debug)]
20pub struct FmIndexView<'a> {
21 bwt_len: usize,
22 block_size: usize,
23 num_blocks: usize,
24 sentinel_pos: usize,
25 sample_rate: usize,
26 c_table: [u32; 6],
27 boundaries: &'a [u32],
29 block_dir: &'a [u64],
31 blocks: &'a [u8],
33 sampled: SampledView<'a>,
34}
35
36#[derive(Debug)]
37struct SampledView<'a> {
38 bwt_len: usize,
39 marks: &'a [u64],
40 superblocks: &'a [u32],
41 values: &'a [u32],
42}
43
44struct BlockView<'a> {
47 start: usize,
48 end: usize,
49 sentinel_offset: Option<usize>,
50 stride: usize,
51 bwt_data: &'a [u64],
52 bwt_amb: &'a [u64],
53 occ_bitvecs: [&'a [u64]; 5],
54 occ_superblocks: [&'a [u32]; 5],
55}
56
57impl<'a> FmIndexView<'a> {
58 pub(crate) fn new(bytes: &'a [u8], sections: &[SectionEntry]) -> Result<Self, IndexIoError> {
61 if cfg!(target_endian = "big") {
62 return Err(IndexIoError::Invalid(
63 "zero-copy index view requires a little-endian host".to_string(),
64 ));
65 }
66
67 let fm_meta = section_bytes(bytes, sections, SectionKind::FmMeta)?;
68 let mut o = 0usize;
69 let block_size = read_u64(fm_meta, &mut o)? as usize;
70 let bwt_len = read_u64(fm_meta, &mut o)? as usize;
71 let sentinel_pos = read_u64(fm_meta, &mut o)? as usize;
72 let sample_rate = read_u64(fm_meta, &mut o)? as usize;
73 let num_blocks = read_u64(fm_meta, &mut o)? as usize;
74 let mut c_table = [0u32; 6];
75 for slot in &mut c_table {
76 *slot = read_u32(fm_meta, &mut o)?;
77 }
78
79 let boundaries_bytes = section_bytes(bytes, sections, SectionKind::Boundaries)?;
80 let boundaries = as_u32_slice(boundaries_bytes)?;
81 if boundaries.len() != 6 * (num_blocks + 1) {
82 return Err(IndexIoError::Invalid(
83 "boundaries length mismatch".to_string(),
84 ));
85 }
86
87 let blocks = section_bytes(bytes, sections, SectionKind::Blocks)?;
88 let dir_bytes = num_blocks
89 .checked_mul(8)
90 .ok_or_else(|| IndexIoError::Invalid("block directory overflow".to_string()))?;
91 if dir_bytes > blocks.len() {
92 return Err(IndexIoError::Invalid(
93 "block directory out of bounds".to_string(),
94 ));
95 }
96 let block_dir = as_u64_slice(&blocks[..dir_bytes])?;
97 validate_block_records(blocks, block_dir)?;
98
99 let sa = section_bytes(bytes, sections, SectionKind::SaSamples)?;
101 let mut so = 0usize;
102 let rate = read_u64(sa, &mut so)? as usize;
103 if rate != sample_rate {
104 return Err(IndexIoError::Invalid(format!(
105 "SaSamples rate {rate} != FmMeta sa_sample_rate {sample_rate}"
106 )));
107 }
108 let s_bwt_len = read_u64(sa, &mut so)? as usize;
109 let marks_words = read_u64(sa, &mut so)? as usize;
110 let sb_len = read_u64(sa, &mut so)? as usize;
111 let values_len = read_u64(sa, &mut so)? as usize;
112 let marks = as_u64_slice(slice_exact(sa, &mut so, marks_words * 8)?)?;
113 let superblocks = as_u32_slice(slice_exact(sa, &mut so, sb_len * 4)?)?;
114 let values = as_u32_slice(slice_exact(sa, &mut so, values_len * 4)?)?;
115
116 Ok(Self {
117 bwt_len,
118 block_size,
119 num_blocks,
120 sentinel_pos,
121 sample_rate,
122 c_table,
123 boundaries,
124 block_dir,
125 blocks,
126 sampled: SampledView {
127 bwt_len: s_bwt_len,
128 marks,
129 superblocks,
130 values,
131 },
132 })
133 }
134
135 pub fn backward_search(&self, pattern: &[u8]) -> FMInterval {
137 fm_backing::backward_search(self, pattern)
138 }
139
140 pub fn sa_at(&self, index: usize) -> usize {
142 fm_backing::sa_at(self, index)
143 }
144
145 pub fn locate_interval(&self, interval: FMInterval, max_hits: usize) -> Vec<u32> {
147 fm_backing::locate_interval(self, interval, max_hits)
148 }
149
150 fn block(&self, block_idx: usize) -> BlockView<'a> {
152 let rec = self.block_dir[block_idx] as usize;
153 let s = &self.blocks[rec..];
154 let mut o = 0usize;
155 let start = read_u64_infallible(s, &mut o) as usize;
156 let end = read_u64_infallible(s, &mut o) as usize;
157 let sentinel_raw = read_i64_infallible(s, &mut o);
158 let stride = read_u64_infallible(s, &mut o) as usize;
159 let bwt_data_words = read_u64_infallible(s, &mut o) as usize;
160 let bwt_amb_words = read_u64_infallible(s, &mut o) as usize;
161 let occ_bitvec_words = read_u64_infallible(s, &mut o) as usize;
162 let occ_superblock_len = read_u64_infallible(s, &mut o) as usize;
163
164 let bwt_data = as_u64_slice(&s[o..o + bwt_data_words * 8]).expect("aligned");
165 o += bwt_data_words * 8;
166 let bwt_amb = as_u64_slice(&s[o..o + bwt_amb_words * 8]).expect("aligned");
167 o += bwt_amb_words * 8;
168
169 let mut occ_bitvecs: [&[u64]; 5] = [&[]; 5];
170 for slot in &mut occ_bitvecs {
171 *slot = as_u64_slice(&s[o..o + occ_bitvec_words * 8]).expect("aligned");
172 o += occ_bitvec_words * 8;
173 }
174 let mut occ_superblocks: [&[u32]; 5] = [&[]; 5];
175 for slot in &mut occ_superblocks {
176 *slot = as_u32_slice(&s[o..o + occ_superblock_len * 4]).expect("aligned");
177 o += occ_superblock_len * 4;
178 }
179
180 BlockView {
181 start,
182 end,
183 sentinel_offset: if sentinel_raw < 0 {
184 None
185 } else {
186 Some(sentinel_raw as usize)
187 },
188 stride,
189 bwt_data,
190 bwt_amb,
191 occ_bitvecs,
192 occ_superblocks,
193 }
194 }
195}
196
197impl BwtBacking for FmIndexView<'_> {
198 fn bwt_len(&self) -> usize {
199 self.bwt_len
200 }
201 fn block_size(&self) -> usize {
202 self.block_size
203 }
204 fn num_blocks(&self) -> usize {
205 self.num_blocks
206 }
207 fn sentinel_pos(&self) -> usize {
208 self.sentinel_pos
209 }
210 fn sample_rate(&self) -> usize {
211 self.sample_rate
212 }
213 fn c_table(&self) -> [u32; 6] {
214 self.c_table
215 }
216 fn boundary_base(&self, block_idx: usize, base_index: usize) -> u32 {
217 self.boundaries[block_idx * 6 + base_index]
218 }
219 fn boundary_sentinel(&self, block_idx: usize) -> u32 {
220 self.boundaries[block_idx * 6 + 5]
221 }
222 fn block_rank(&self, block_idx: usize, symbol: FmSymbol, within: usize) -> u32 {
223 let block = self.block(block_idx);
224 let n = block.end - block.start;
225 let bounded = within.min(n);
226 match symbol {
227 FmSymbol::Sentinel => match block.sentinel_offset {
228 Some(off) if off < bounded => 1,
229 _ => 0,
230 },
231 FmSymbol::Base(code) => {
232 let bi = code.index();
234 let sb = bounded / block.stride;
235 let within_start = sb * block.stride;
236 let mut count = block.occ_superblocks[bi][sb]
237 + popcount_range(block.occ_bitvecs[bi], within_start, bounded);
238 if code == BaseCode::N {
241 if let Some(off) = block.sentinel_offset {
242 if off < bounded {
243 count = count.saturating_sub(1);
244 }
245 }
246 }
247 count
248 }
249 }
250 }
251 fn block_symbol(&self, block_idx: usize, within: usize) -> FmSymbol {
252 let block = self.block(block_idx);
253 let amb_word = block.bwt_amb[within / 64];
256 if amb_word & (1u64 << (within % 64)) != 0 {
257 return FmSymbol::Base(BaseCode::N);
258 }
259 let data_word = block.bwt_data[within / 32];
260 let code = ((data_word >> ((within % 32) * 2)) & 0b11) as u8;
261 let base = match code {
262 0 => BaseCode::A,
263 1 => BaseCode::C,
264 2 => BaseCode::G,
265 _ => BaseCode::T,
266 };
267 FmSymbol::Base(base)
268 }
269 fn sampled_at(&self, index: usize) -> Option<u32> {
270 debug_assert!(index < self.sampled.bwt_len);
271 let word = self.sampled.marks[index / 64];
272 if word & (1u64 << (index % 64)) == 0 {
273 return None;
274 }
275 let sb = index / RANK_STRIDE;
276 let rank = self.sampled.superblocks[sb]
277 + popcount_range(self.sampled.marks, sb * RANK_STRIDE, index);
278 Some(self.sampled.values[rank as usize])
279 }
280}
281
282#[derive(Debug)]
284pub struct GenomeIndexView<'a> {
285 fm: FmIndexView<'a>,
286 contigs: &'a ContigSet,
287}
288
289impl<'a> GenomeIndexView<'a> {
290 pub(crate) fn new(fm: FmIndexView<'a>, contigs: &'a ContigSet) -> Self {
291 Self { fm, contigs }
292 }
293
294 pub fn fm(&self) -> &FmIndexView<'a> {
296 &self.fm
297 }
298
299 pub fn contigs(&self) -> &ContigSet {
301 self.contigs
302 }
303
304 pub fn locate_exact(&self, pattern: &[u8], max_hits: usize) -> Vec<Locus> {
308 if pattern.is_empty() {
309 return Vec::new();
310 }
311 let pattern: Vec<u8> = pattern.iter().map(|b| b.to_ascii_uppercase()).collect();
312 let interval = self.fm.backward_search(&pattern);
313 let len = pattern.len() as u32;
314 let mut loci: Vec<Locus> = self
315 .fm
316 .locate_interval(interval, max_hits)
317 .into_iter()
318 .filter_map(|g| self.contigs.resolve_span(g as u64, len))
319 .collect();
320 loci.sort_unstable();
321 loci
322 }
323}
324
325#[derive(Debug)]
332pub struct ReferenceView<'a> {
333 len: usize,
334 data: &'a [u64],
336 amb: &'a [u64],
338}
339
340impl<'a> ReferenceView<'a> {
341 pub(crate) fn new(bytes: &'a [u8], sections: &[SectionEntry]) -> Result<Self, IndexIoError> {
343 if cfg!(target_endian = "big") {
344 return Err(IndexIoError::Invalid(
345 "zero-copy reference view requires a little-endian host".to_string(),
346 ));
347 }
348 let section = section_bytes(bytes, sections, SectionKind::Reference2bit)?;
349 let mut o = 0usize;
350 let len = read_u64(section, &mut o)? as usize;
351 let data_words = read_u64(section, &mut o)? as usize;
352 let amb_words = read_u64(section, &mut o)? as usize;
353 let data = as_u64_slice(slice_exact(section, &mut o, data_words.saturating_mul(8))?)?;
354 let amb = as_u64_slice(slice_exact(section, &mut o, amb_words.saturating_mul(8))?)?;
355 if data.len().saturating_mul(32) < len || amb.len().saturating_mul(64) < len {
357 return Err(IndexIoError::Invalid(
358 "Reference2bit section too small for the declared length".to_string(),
359 ));
360 }
361 Ok(Self { len, data, amb })
362 }
363
364 pub fn len(&self) -> usize {
366 self.len
367 }
368
369 pub fn is_empty(&self) -> bool {
371 self.len == 0
372 }
373
374 pub fn base_at(&self, global: usize) -> u8 {
377 debug_assert!(global < self.len, "reference index out of range");
378 if self.amb[global / 64] & (1u64 << (global % 64)) != 0 {
379 return b'N';
380 }
381 let code = ((self.data[global / 32] >> ((global % 32) * 2)) & 0b11) as u8;
382 match code {
383 0 => b'A',
384 1 => b'C',
385 2 => b'G',
386 _ => b'T',
387 }
388 }
389
390 pub fn decode_window(&self, start: usize, end: usize, out: &mut Vec<u8>) {
394 out.clear();
395 let end = end.min(self.len);
396 for i in start..end {
397 out.push(self.base_at(i));
398 }
399 }
400
401 pub fn decode_window_arc(&self, start: usize, end: usize) -> std::sync::Arc<[u8]> {
410 let end = end.min(self.len);
411 (start..end).map(|i| self.base_at(i)).collect()
412 }
413}
414
415fn validate_block_records(blocks: &[u8], block_dir: &[u64]) -> Result<(), IndexIoError> {
420 const HEADER: usize = 64; for &rec_off in block_dir {
422 let rec = rec_off as usize;
423 #[allow(clippy::manual_is_multiple_of)]
430 if rec % 8 != 0 {
431 return Err(IndexIoError::Invalid(
432 "block record offset is not 8-aligned".to_string(),
433 ));
434 }
435 let header_end = rec
436 .checked_add(HEADER)
437 .filter(|&e| e <= blocks.len())
438 .ok_or_else(|| {
439 IndexIoError::Invalid("block record header out of bounds".to_string())
440 })?;
441 let mut o = rec + 32;
444 let bwt_data_words = read_u64(blocks, &mut o)? as usize;
445 let bwt_amb_words = read_u64(blocks, &mut o)? as usize;
446 let occ_bitvec_words = read_u64(blocks, &mut o)? as usize;
447 let occ_superblock_len = read_u64(blocks, &mut o)? as usize;
448 debug_assert_eq!(o, header_end);
449 let payload = bwt_data_words
452 .checked_mul(8)
453 .and_then(|x| bwt_amb_words.checked_mul(8).and_then(|y| x.checked_add(y)))
454 .and_then(|x| {
455 occ_bitvec_words
456 .checked_mul(8)
457 .and_then(|y| y.checked_mul(5))
458 .and_then(|y| x.checked_add(y))
459 })
460 .and_then(|x| {
461 occ_superblock_len
462 .checked_mul(4)
463 .and_then(|y| y.checked_mul(5))
464 .and_then(|y| x.checked_add(y))
465 })
466 .ok_or_else(|| IndexIoError::Invalid("block record size overflow".to_string()))?;
467 let rec_end = header_end
468 .checked_add(payload)
469 .filter(|&e| e <= blocks.len())
470 .ok_or_else(|| {
471 IndexIoError::Invalid("block record payload out of bounds".to_string())
472 })?;
473 let _ = rec_end;
474 }
475 Ok(())
476}
477
478fn section_bytes<'a>(
480 bytes: &'a [u8],
481 sections: &[SectionEntry],
482 kind: SectionKind,
483) -> Result<&'a [u8], IndexIoError> {
484 let s = sections
485 .iter()
486 .find(|s| s.kind == kind)
487 .ok_or_else(|| IndexIoError::Invalid(format!("missing section {kind:?}")))?;
488 let start = s.offset as usize;
489 let end = start + s.bytes as usize;
490 bytes
491 .get(start..end)
492 .ok_or_else(|| IndexIoError::Invalid(format!("section {kind:?} out of bounds")))
493}
494
495fn as_u64_slice(bytes: &[u8]) -> Result<&[u64], IndexIoError> {
497 let (prefix, mid, suffix) = unsafe { bytes.align_to::<u64>() };
501 if !prefix.is_empty() || !suffix.is_empty() {
502 return Err(IndexIoError::Invalid("misaligned u64 array".to_string()));
503 }
504 Ok(mid)
505}
506
507fn as_u32_slice(bytes: &[u8]) -> Result<&[u32], IndexIoError> {
509 let (prefix, mid, suffix) = unsafe { bytes.align_to::<u32>() };
511 if !prefix.is_empty() || !suffix.is_empty() {
512 return Err(IndexIoError::Invalid("misaligned u32 array".to_string()));
513 }
514 Ok(mid)
515}
516
517fn slice_exact<'a>(
519 bytes: &'a [u8],
520 offset: &mut usize,
521 len: usize,
522) -> Result<&'a [u8], IndexIoError> {
523 let end = offset
524 .checked_add(len)
525 .ok_or_else(|| IndexIoError::Invalid("slice overflow".to_string()))?;
526 let out = bytes
527 .get(*offset..end)
528 .ok_or_else(|| IndexIoError::Invalid("slice out of bounds".to_string()))?;
529 *offset = end;
530 Ok(out)
531}
532
533fn read_u32(bytes: &[u8], offset: &mut usize) -> Result<u32, IndexIoError> {
534 let end = *offset + 4;
535 let buf = bytes
536 .get(*offset..end)
537 .ok_or_else(|| IndexIoError::Invalid("unexpected EOF".to_string()))?;
538 *offset = end;
539 Ok(u32::from_le_bytes(buf.try_into().unwrap()))
540}
541
542fn read_u64(bytes: &[u8], offset: &mut usize) -> Result<u64, IndexIoError> {
543 let end = *offset + 8;
544 let buf = bytes
545 .get(*offset..end)
546 .ok_or_else(|| IndexIoError::Invalid("unexpected EOF".to_string()))?;
547 *offset = end;
548 Ok(u64::from_le_bytes(buf.try_into().unwrap()))
549}
550
551fn read_u64_infallible(bytes: &[u8], offset: &mut usize) -> u64 {
552 let v = u64::from_le_bytes(bytes[*offset..*offset + 8].try_into().unwrap());
553 *offset += 8;
554 v
555}
556
557fn read_i64_infallible(bytes: &[u8], offset: &mut usize) -> i64 {
558 let v = i64::from_le_bytes(bytes[*offset..*offset + 8].try_into().unwrap());
559 *offset += 8;
560 v
561}