use std::collections::HashMap;
use std::path::Path;
use memmap2::Mmap;
use noodles_fasta::fai;
use crate::fasta::SidecarCache;
use refget_model::SequenceMetadata;
use crate::fasta::{DigestCache, FastaSequenceSummary, index_digests, read_fai_index};
use crate::{SequenceStore, StoreError, StoreResult};
pub struct MmapSequenceStore {
digest_index: HashMap<String, usize>,
records: Vec<MmapRecordInfo>,
mmaps: Vec<Mmap>,
}
struct MmapRecordInfo {
mmap_idx: usize,
metadata: SequenceMetadata,
fai_offset: u64,
fai_line_bases: u64,
fai_line_width: u64,
}
impl MmapSequenceStore {
pub fn new() -> Self {
Self { digest_index: HashMap::new(), records: Vec::new(), mmaps: Vec::new() }
}
pub fn mark_circular(&mut self, circular_names: &[String]) {
for record in &mut self.records {
if circular_names.iter().any(|n| n.as_str() == record.metadata.aliases[0].value) {
record.metadata.circular = true;
}
}
}
pub fn add_fasta<P: AsRef<Path>>(&mut self, path: P) -> StoreResult<Vec<FastaSequenceSummary>> {
let path = path.as_ref();
let cache = DigestCache::load_if_fresh(path).ok_or_else(|| {
StoreError::Fasta(format!(
"Disk mode requires a fresh .refget.json cache for {}. \
Run `refget-tools cache {}` first.",
path.display(),
path.display()
))
})?;
let index = read_fai_index(path)?;
let fai_records: &[fai::Record] = index.as_ref();
if cache.sequences.len() != fai_records.len() {
return Err(StoreError::Fasta(format!(
"Digest cache has {} entries but FAI index has {} for {}",
cache.sequences.len(),
fai_records.len(),
path.display()
)));
}
let file = std::fs::File::open(path)?;
let mmap = unsafe { Mmap::map(&file) }
.map_err(|e| StoreError::Fasta(format!("Failed to mmap {}: {e}", path.display())))?;
let mmap_idx = self.mmaps.len();
self.mmaps.push(mmap);
let mut summaries = Vec::new();
for (cached, fai_rec) in cache.sequences.iter().zip(fai_records.iter()) {
if fai_rec.length() != cached.length {
return Err(StoreError::Fasta(format!(
"FAI/cache length mismatch for {}: FAI says {}, cache says {}",
cached.name,
fai_rec.length(),
cached.length
)));
}
let metadata = cached.to_metadata();
summaries.push(cached.to_summary());
let record_idx = self.records.len();
index_digests(&mut self.digest_index, cached, record_idx);
self.records.push(MmapRecordInfo {
mmap_idx,
metadata,
fai_offset: fai_rec.offset(),
fai_line_bases: fai_rec.line_bases(),
fai_line_width: fai_rec.line_width(),
});
}
Ok(summaries)
}
fn extract_bases(&self, info: &MmapRecordInfo, start: u64, end: u64) -> Vec<u8> {
let mmap = &self.mmaps[info.mmap_idx];
let len = (end - start) as usize;
let mut result = Vec::with_capacity(len);
let mut pos = start;
while pos < end {
let line_idx = pos / info.fai_line_bases;
let col = pos % info.fai_line_bases;
let line_start = info.fai_offset + line_idx * info.fai_line_width;
let remaining_on_line = info.fai_line_bases - col;
let to_read = remaining_on_line.min(end - pos) as usize;
let byte_start = (line_start + col) as usize;
for &b in &mmap[byte_start..byte_start + to_read] {
result.push(b.to_ascii_uppercase());
}
pos += to_read as u64;
}
result
}
}
impl Default for MmapSequenceStore {
fn default() -> Self {
Self::new()
}
}
impl SequenceStore for MmapSequenceStore {
fn get_sequence(
&self,
digest: &str,
start: Option<u64>,
end: Option<u64>,
) -> StoreResult<Option<Vec<u8>>> {
let Some(&record_idx) = self.digest_index.get(digest) else {
return Ok(None);
};
let info = &self.records[record_idx];
let length = info.metadata.length;
let start = start.unwrap_or(0);
let end = end.unwrap_or(length).min(length);
if start >= length {
return Ok(Some(vec![]));
}
Ok(Some(self.extract_bases(info, start, end)))
}
fn get_metadata(&self, digest: &str) -> StoreResult<Option<SequenceMetadata>> {
Ok(self.digest_index.get(digest).map(|&idx| self.records[idx].metadata.clone()))
}
fn get_length(&self, digest: &str) -> StoreResult<Option<u64>> {
Ok(self.digest_index.get(digest).map(|&idx| self.records[idx].metadata.length))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::TempDir;
fn write_test_fasta_with_cache(dir: &TempDir) -> std::path::PathBuf {
let fasta_path = dir.path().join("test.fa");
let mut f = std::fs::File::create(&fasta_path).unwrap();
writeln!(f, ">seq1").unwrap();
writeln!(f, "ACGT").unwrap();
writeln!(f, "NNNN").unwrap();
writeln!(f, ">seq2").unwrap();
writeln!(f, "TTTT").unwrap();
let fai_path = dir.path().join("test.fa.fai");
let mut fai = std::fs::File::create(&fai_path).unwrap();
writeln!(fai, "seq1\t8\t6\t4\t5").unwrap();
writeln!(fai, "seq2\t4\t22\t4\t5").unwrap();
let cache = DigestCache::from_fasta(&fasta_path).unwrap();
cache.write(&fasta_path).unwrap();
fasta_path
}
#[test]
fn test_mmap_load_and_retrieve() {
let dir = TempDir::new().unwrap();
let fasta_path = write_test_fasta_with_cache(&dir);
let mut store = MmapSequenceStore::new();
let summaries = store.add_fasta(&fasta_path).unwrap();
assert_eq!(summaries.len(), 2);
let seq = store.get_sequence(&summaries[0].sha512t24u, None, None).unwrap().unwrap();
assert_eq!(seq, b"ACGTNNNN");
let seq = store.get_sequence(&summaries[1].sha512t24u, None, None).unwrap().unwrap();
assert_eq!(seq, b"TTTT");
}
#[test]
fn test_mmap_subsequence() {
let dir = TempDir::new().unwrap();
let fasta_path = write_test_fasta_with_cache(&dir);
let mut store = MmapSequenceStore::new();
let summaries = store.add_fasta(&fasta_path).unwrap();
let seq = store.get_sequence(&summaries[0].sha512t24u, Some(1), Some(3)).unwrap().unwrap();
assert_eq!(seq, b"CG");
let seq = store.get_sequence(&summaries[0].sha512t24u, Some(2), Some(6)).unwrap().unwrap();
assert_eq!(seq, b"GTNN");
}
#[test]
fn test_mmap_metadata() {
let dir = TempDir::new().unwrap();
let fasta_path = write_test_fasta_with_cache(&dir);
let mut store = MmapSequenceStore::new();
let summaries = store.add_fasta(&fasta_path).unwrap();
let meta = store.get_metadata(&summaries[0].sha512t24u).unwrap().unwrap();
assert_eq!(meta.length, 8);
assert!(meta.sha512t24u.starts_with("SQ."));
}
#[test]
fn test_mmap_get_length() {
let dir = TempDir::new().unwrap();
let fasta_path = write_test_fasta_with_cache(&dir);
let mut store = MmapSequenceStore::new();
let summaries = store.add_fasta(&fasta_path).unwrap();
assert_eq!(store.get_length(&summaries[0].sha512t24u).unwrap(), Some(8));
assert_eq!(store.get_length(&summaries[1].sha512t24u).unwrap(), Some(4));
assert_eq!(store.get_length("nonexistent").unwrap(), None);
}
#[test]
fn test_mmap_not_found() {
let store = MmapSequenceStore::new();
assert!(store.get_sequence("missing", None, None).unwrap().is_none());
assert!(store.get_metadata("missing").unwrap().is_none());
}
#[test]
fn test_mmap_start_beyond_length() {
let dir = TempDir::new().unwrap();
let fasta_path = write_test_fasta_with_cache(&dir);
let mut store = MmapSequenceStore::new();
let summaries = store.add_fasta(&fasta_path).unwrap();
let seq = store.get_sequence(&summaries[0].sha512t24u, Some(100), None).unwrap().unwrap();
assert!(seq.is_empty());
}
#[test]
fn test_mmap_requires_cache() {
let dir = TempDir::new().unwrap();
let fasta_path = dir.path().join("nocache.fa");
let mut f = std::fs::File::create(&fasta_path).unwrap();
writeln!(f, ">seq1\nACGT").unwrap();
let mut fai = std::fs::File::create(dir.path().join("nocache.fa.fai")).unwrap();
writeln!(fai, "seq1\t4\t6\t4\t5").unwrap();
let mut store = MmapSequenceStore::new();
let err = store.add_fasta(&fasta_path).unwrap_err();
let msg = format!("{err}");
assert!(msg.contains("requires a fresh .refget.json"), "Unexpected: {msg}");
}
#[test]
fn test_mmap_multiple_fastas() {
let dir = TempDir::new().unwrap();
let fa1 = dir.path().join("a.fa");
let mut f = std::fs::File::create(&fa1).unwrap();
writeln!(f, ">s1\nAAAA").unwrap();
let mut fai = std::fs::File::create(dir.path().join("a.fa.fai")).unwrap();
writeln!(fai, "s1\t4\t4\t4\t5").unwrap();
DigestCache::from_fasta(&fa1).unwrap().write(&fa1).unwrap();
let fa2 = dir.path().join("b.fa");
let mut f = std::fs::File::create(&fa2).unwrap();
writeln!(f, ">s2\nCCCC").unwrap();
let mut fai = std::fs::File::create(dir.path().join("b.fa.fai")).unwrap();
writeln!(fai, "s2\t4\t4\t4\t5").unwrap();
DigestCache::from_fasta(&fa2).unwrap().write(&fa2).unwrap();
let mut store = MmapSequenceStore::new();
let s1 = store.add_fasta(&fa1).unwrap();
let s2 = store.add_fasta(&fa2).unwrap();
let seq1 = store.get_sequence(&s1[0].sha512t24u, None, None).unwrap().unwrap();
assert_eq!(seq1, b"AAAA");
let seq2 = store.get_sequence(&s2[0].sha512t24u, None, None).unwrap().unwrap();
assert_eq!(seq2, b"CCCC");
}
#[test]
fn test_mmap_lowercase_uppercased() {
let dir = TempDir::new().unwrap();
let fasta_path = dir.path().join("lower.fa");
let mut f = std::fs::File::create(&fasta_path).unwrap();
writeln!(f, ">seq1\nacgt").unwrap();
let mut fai = std::fs::File::create(dir.path().join("lower.fa.fai")).unwrap();
writeln!(fai, "seq1\t4\t6\t4\t5").unwrap();
DigestCache::from_fasta(&fasta_path).unwrap().write(&fasta_path).unwrap();
let mut store = MmapSequenceStore::new();
let summaries = store.add_fasta(&fasta_path).unwrap();
let seq = store.get_sequence(&summaries[0].sha512t24u, None, None).unwrap().unwrap();
assert_eq!(seq, b"ACGT");
}
#[test]
fn test_mmap_matches_memory_store() {
let dir = TempDir::new().unwrap();
let fasta_path = write_test_fasta_with_cache(&dir);
let (mem_store, mem_summaries) =
crate::fasta::FastaSequenceStore::from_fasta(&fasta_path).unwrap();
let mut mmap_store = MmapSequenceStore::new();
let mmap_summaries = mmap_store.add_fasta(&fasta_path).unwrap();
assert_eq!(mem_summaries.len(), mmap_summaries.len());
for (a, b) in mem_summaries.iter().zip(mmap_summaries.iter()) {
assert_eq!(a.name, b.name);
assert_eq!(a.length, b.length);
assert_eq!(a.md5, b.md5);
assert_eq!(a.sha512t24u, b.sha512t24u);
}
for s in &mem_summaries {
let mem_seq = mem_store.get_sequence(&s.sha512t24u, None, None).unwrap().unwrap();
let mmap_seq = mmap_store.get_sequence(&s.sha512t24u, None, None).unwrap().unwrap();
assert_eq!(mem_seq, mmap_seq, "Mismatch for {}", s.name);
}
}
}