use std::sync::Arc;
use thiserror::Error;
use crate::core::{ContigSet, Locus};
use crate::genomics::{BlockedFMIndex, FMIndexError};
pub const MAX_GENOME_LEN: u64 = u32::MAX as u64 - 1;
#[derive(Debug, Error)]
pub enum GenomeIndexError {
#[error("concatenated genome length {len} exceeds the {max}-byte limit (u32 suffix array)")]
GenomeTooLarge {
len: u64,
max: u64,
},
#[error("genome index requires a non-empty reference")]
EmptyGenome,
#[error("fm-index error: {0}")]
FmIndex(#[from] FMIndexError),
}
fn validate_concat_len(len: u64) -> Result<(), GenomeIndexError> {
if len == 0 {
return Err(GenomeIndexError::EmptyGenome);
}
if len > MAX_GENOME_LEN {
return Err(GenomeIndexError::GenomeTooLarge {
len,
max: MAX_GENOME_LEN,
});
}
Ok(())
}
#[derive(Debug)]
pub struct GenomeIndex {
fm: BlockedFMIndex,
contigs: ContigSet,
reference: Arc<[u8]>,
}
impl GenomeIndex {
pub fn build(contigs: ContigSet, reference: Arc<[u8]>) -> Result<Self, GenomeIndexError> {
validate_concat_len(reference.len() as u64)?;
let reference: Arc<[u8]> = Arc::from(
reference
.iter()
.map(|b| b.to_ascii_uppercase())
.collect::<Vec<u8>>()
.into_boxed_slice(),
);
let block_size = ((reference.len() as f64).sqrt().ceil() as usize).max(64);
let fm = BlockedFMIndex::build(&reference, block_size)?;
Ok(Self {
fm,
contigs,
reference,
})
}
pub fn from_named_sequences(named: &[(String, Vec<u8>)]) -> Result<Self, GenomeIndexError> {
let mut contigs = ContigSet::new();
let total: usize = named.iter().map(|(_, seq)| seq.len()).sum();
let mut concat = Vec::with_capacity(total);
for (name, seq) in named {
contigs.push(name.clone(), seq.len() as u32);
concat.extend_from_slice(seq);
}
Self::build(contigs, Arc::from(concat.into_boxed_slice()))
}
pub fn contigs(&self) -> &ContigSet {
&self.contigs
}
pub fn reference(&self) -> &[u8] {
&self.reference
}
pub fn fm(&self) -> &BlockedFMIndex {
&self.fm
}
pub fn locate_exact(&self, pattern: &[u8], max_hits: usize) -> Vec<Locus> {
if pattern.is_empty() {
return Vec::new();
}
let pattern: Vec<u8> = pattern.iter().map(|b| b.to_ascii_uppercase()).collect();
let interval = self.fm.backward_search(&pattern);
let len = pattern.len() as u32;
let mut loci: Vec<Locus> = self
.fm
.locate_interval(interval, max_hits)
.into_iter()
.filter_map(|g| self.contigs.resolve_span(g as u64, len))
.collect();
loci.sort_unstable();
loci
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::core::Position;
fn two_contig_index() -> GenomeIndex {
let mut contigs = ContigSet::new();
contigs.push("chr1", 8);
contigs.push("chr2", 8);
let reference: Arc<[u8]> = Arc::from(b"ACGTACGTTTTTGGGG".to_vec().into_boxed_slice());
GenomeIndex::build(contigs, reference).expect("build should succeed")
}
fn naive_loci(reference: &[u8], pattern: &[u8], contigs: &ContigSet) -> Vec<Locus> {
let mut out = Vec::new();
if pattern.is_empty() || pattern.len() > reference.len() {
return out;
}
for start in 0..=reference.len() - pattern.len() {
if &reference[start..start + pattern.len()] == pattern {
if let Some(locus) = contigs.resolve_span(start as u64, pattern.len() as u32) {
out.push(locus);
}
}
}
out.sort_unstable();
out
}
#[test]
fn stores_contigs_and_reference() {
let idx = two_contig_index();
assert_eq!(idx.contigs().len(), 2);
assert_eq!(idx.contigs().by_name("chr2").unwrap().global_offset, 8);
assert_eq!(idx.reference(), b"ACGTACGTTTTTGGGG");
}
#[test]
fn locates_a_pattern_unique_to_chr2() {
let idx = two_contig_index();
assert_eq!(
idx.locate_exact(b"GGGG", 16),
vec![Locus {
contig: 1,
pos: Position(4)
}]
);
}
#[test]
fn locates_a_pattern_unique_to_chr1() {
let idx = two_contig_index();
assert_eq!(
idx.locate_exact(b"ACGTAC", 16),
vec![Locus {
contig: 0,
pos: Position(0)
}]
);
}
#[test]
fn rejects_a_boundary_straddling_match() {
let idx = two_contig_index();
let loci = idx.locate_exact(b"GTTTTT", 16);
assert!(
loci.is_empty(),
"boundary-straddling hit must be rejected, got {loci:?}"
);
}
#[test]
fn exact_match_matches_a_naive_scan() {
let idx = two_contig_index();
let reference = idx.reference().to_vec();
for pattern in [
b"ACGT".as_slice(),
b"GT".as_slice(),
b"TTTT".as_slice(),
b"T".as_slice(),
b"CG".as_slice(),
b"GTTTTT".as_slice(),
] {
let expected = naive_loci(&reference, pattern, idx.contigs());
let got = idx.locate_exact(pattern, 1024);
assert_eq!(
got,
expected,
"mismatch for pattern {:?}",
std::str::from_utf8(pattern).unwrap()
);
}
}
#[test]
fn locate_exact_is_deterministic() {
let idx = two_contig_index();
assert_eq!(idx.locate_exact(b"T", 1024), idx.locate_exact(b"T", 1024));
}
#[test]
fn empty_pattern_returns_nothing() {
let idx = two_contig_index();
assert!(idx.locate_exact(b"", 16).is_empty());
}
#[test]
fn validate_concat_len_enforces_the_u32_ceiling() {
assert!(matches!(
validate_concat_len(0),
Err(GenomeIndexError::EmptyGenome)
));
assert!(validate_concat_len(1).is_ok());
assert!(validate_concat_len(MAX_GENOME_LEN).is_ok());
assert!(matches!(
validate_concat_len(MAX_GENOME_LEN + 1),
Err(GenomeIndexError::GenomeTooLarge { .. })
));
assert!(matches!(
validate_concat_len(u32::MAX as u64),
Err(GenomeIndexError::GenomeTooLarge { .. })
));
}
#[test]
fn max_hits_caps_candidates() {
let idx = two_contig_index();
assert_eq!(idx.locate_exact(b"ACGT", 16).len(), 2);
assert_eq!(idx.locate_exact(b"ACGT", 1).len(), 1);
}
#[test]
fn from_named_sequences_builds_a_multi_contig_index() {
let idx = GenomeIndex::from_named_sequences(&[
("chr1".to_string(), b"ACGTACGT".to_vec()),
("chr2".to_string(), b"TTTTGGGG".to_vec()),
("chr3".to_string(), b"CCCCAAAA".to_vec()),
])
.expect("build should succeed");
assert_eq!(idx.contigs().len(), 3);
assert_eq!(idx.reference(), b"ACGTACGTTTTTGGGGCCCCAAAA");
assert_eq!(
idx.locate_exact(b"CCCC", 16),
vec![Locus {
contig: 2,
pos: Position(0)
}]
);
}
#[test]
fn from_named_sequences_rejects_an_empty_genome() {
assert!(matches!(
GenomeIndex::from_named_sequences(&[]),
Err(GenomeIndexError::EmptyGenome)
));
}
}