use crate::errors::FgumiError;
use anyhow::{Context, Result};
use log::debug;
use noodles::core::Position;
use noodles::fasta::fai;
use std::collections::HashMap;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use std::sync::Arc;
#[allow(clippy::cast_possible_truncation)]
fn read_sequence_raw(file: &mut File, record: &fai::Record) -> Result<Vec<u8>> {
let line_bases = record.line_bases() as usize;
let line_width = record.line_width() as usize;
let seq_len = record.length() as usize;
let offset = record.offset();
if seq_len <= line_bases {
file.seek(SeekFrom::Start(offset))?;
let mut sequence = vec![0u8; seq_len];
file.read_exact(&mut sequence)?;
return Ok(sequence);
}
let complete_lines = seq_len / line_bases;
let remaining_bases = seq_len % line_bases;
let total_bytes = if remaining_bases > 0 {
complete_lines * line_width + remaining_bases
} else if complete_lines > 0 {
(complete_lines - 1) * line_width + line_bases
} else {
0
};
file.seek(SeekFrom::Start(offset))?;
let mut raw_bytes = vec![0u8; total_bytes];
file.read_exact(&mut raw_bytes)?;
let mut sequence = Vec::with_capacity(seq_len);
let terminator_len = line_width - line_bases;
let mut pos = 0;
while sequence.len() < seq_len && pos < raw_bytes.len() {
let bases_to_copy = (seq_len - sequence.len()).min(line_bases).min(raw_bytes.len() - pos);
sequence.extend_from_slice(&raw_bytes[pos..pos + bases_to_copy]);
pos += bases_to_copy;
if sequence.len() < seq_len && pos < raw_bytes.len() {
pos += terminator_len;
}
}
Ok(sequence)
}
fn find_sibling_file(fasta_path: &Path, replace_ext: &str, append_ext: &str) -> Option<PathBuf> {
let replaced = fasta_path.with_extension(replace_ext);
if replaced.exists() {
return Some(replaced);
}
let appended = PathBuf::from(format!("{}.{append_ext}", fasta_path.display()));
if appended.exists() {
return Some(appended);
}
None
}
fn find_fai_path(fasta_path: &Path) -> Option<PathBuf> {
find_sibling_file(fasta_path, "fa.fai", "fai")
}
#[must_use]
pub fn find_dict_path(fasta_path: &Path) -> Option<PathBuf> {
find_sibling_file(fasta_path, "dict", "dict")
}
#[derive(Clone)]
pub struct ReferenceReader {
sequences: Arc<HashMap<String, Vec<u8>>>,
}
impl ReferenceReader {
pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
if !path.exists() {
return Err(FgumiError::InvalidFileFormat {
file_type: "Reference FASTA".to_string(),
path: path.display().to_string(),
reason: "File does not exist".to_string(),
}
.into());
}
debug!("Reading reference FASTA into memory: {}", path.display());
if let Some(fai_path) = find_fai_path(path) {
debug!("Using FAI index for fast loading: {}", fai_path.display());
return Self::new_with_fai(path, &fai_path);
}
debug!("No FAI index found, using sequential reading");
Self::new_sequential(path)
}
fn new_with_fai(fasta_path: &Path, fai_path: &Path) -> Result<Self> {
let index = fai::fs::read(fai_path)
.with_context(|| format!("Failed to read FAI index: {}", fai_path.display()))?;
let records: &[fai::Record] = index.as_ref();
let mut file = File::open(fasta_path)
.with_context(|| format!("Failed to open FASTA: {}", fasta_path.display()))?;
let mut sequences = HashMap::with_capacity(records.len());
for record in records {
let raw_sequence = read_sequence_raw(&mut file, record)?;
let name = String::from_utf8_lossy(record.name().as_ref()).into_owned();
sequences.insert(name, raw_sequence);
}
debug!("Loaded {} contigs into memory (FAI-indexed)", sequences.len());
Ok(Self { sequences: Arc::new(sequences) })
}
fn new_sequential(path: &Path) -> Result<Self> {
use noodles::fasta;
let mut sequences = HashMap::new();
let mut reader = fasta::io::reader::Builder.build_from_path(path)?;
for result in reader.records() {
let record = result?;
let name = std::str::from_utf8(record.name())?.to_string();
let raw_sequence: Vec<u8> = record.sequence().as_ref().to_vec();
sequences.insert(name, raw_sequence);
}
debug!("Loaded {} contigs into memory (sequential)", sequences.len());
Ok(Self { sequences: Arc::new(sequences) })
}
pub fn fetch(&self, chrom: &str, start: Position, end: Position) -> Result<Vec<u8>> {
Ok(self.fetch_slice(chrom, start, end)?.to_vec())
}
pub fn fetch_slice(&self, chrom: &str, start: Position, end: Position) -> Result<&[u8]> {
let sequence = self
.sequences
.get(chrom)
.ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
let start_idx = usize::from(start) - 1;
let end_idx = usize::from(end);
if end_idx > sequence.len() || start_idx >= end_idx {
return Err(FgumiError::InvalidParameter {
parameter: "region".to_string(),
reason: format!(
"Requested region {}:{}-{} exceeds sequence length {}",
chrom,
start,
end,
sequence.len()
),
}
.into());
}
Ok(&sequence[start_idx..end_idx])
}
pub fn base_at(&self, chrom: &str, pos: Position) -> Result<u8> {
let sequence = self
.sequences
.get(chrom)
.ok_or_else(|| FgumiError::ReferenceNotFound { ref_name: chrom.to_string() })?;
let pos_idx = usize::from(pos) - 1;
sequence.get(pos_idx).copied().ok_or_else(|| {
FgumiError::InvalidParameter {
parameter: "position".to_string(),
reason: format!(
"Position {}:{} exceeds sequence length {}",
chrom,
pos,
sequence.len()
),
}
.into()
})
}
}
impl fgumi_sam::ReferenceProvider for ReferenceReader {
fn fetch(
&self,
chrom: &str,
start: noodles::core::Position,
end: noodles::core::Position,
) -> anyhow::Result<Vec<u8>> {
self.fetch(chrom, start, end)
}
}
#[cfg(feature = "simplex")]
impl fgumi_consensus::methylation::RefBaseProvider for ReferenceReader {
fn base_at_0based(&self, chrom: &str, pos: u64) -> Option<u8> {
let sequence = self.sequences.get(chrom)?;
sequence.get(usize::try_from(pos).ok()?).copied()
}
fn sequence_for(&self, chrom: &str) -> Option<&[u8]> {
self.sequences.get(chrom).map(Vec::as_slice)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::sam::builder::create_default_test_fasta;
#[test]
fn test_fetch_subsequence() -> Result<()> {
let fasta = create_default_test_fasta()?;
let reader = ReferenceReader::new(fasta.path())?;
let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
assert_eq!(seq, b"ACGT");
let seq = reader.fetch("chr2", Position::try_from(5)?, Position::try_from(8)?)?;
assert_eq!(seq, b"CCCC");
Ok(())
}
#[test]
fn test_fetch_slice_returns_borrowed_bytes() -> Result<()> {
let fasta = create_default_test_fasta()?;
let reader = ReferenceReader::new(fasta.path())?;
let slice: &[u8] =
reader.fetch_slice("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
assert_eq!(slice, b"ACGT");
let owned = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
assert_eq!(slice, owned.as_slice());
assert!(
reader
.fetch_slice("chr1", Position::try_from(1)?, Position::try_from(10_000)?)
.is_err()
);
assert!(
reader.fetch_slice("nope", Position::try_from(1)?, Position::try_from(2)?).is_err()
);
assert!(
reader.fetch_slice("chr1", Position::try_from(5)?, Position::try_from(4)?).is_err()
);
Ok(())
}
#[test]
fn test_base_at() -> Result<()> {
let fasta = create_default_test_fasta()?;
let reader = ReferenceReader::new(fasta.path())?;
assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'G');
Ok(())
}
#[test]
fn test_all_sequences_loaded() -> Result<()> {
let fasta = create_default_test_fasta()?;
let reader = ReferenceReader::new(fasta.path())?;
let seq1 = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
assert_eq!(seq1, b"ACGT");
let seq2 = reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?;
assert_eq!(seq2, b"GGGG");
let seq1_again = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?;
assert_eq!(seq1_again, b"ACGT");
Ok(())
}
#[test]
fn test_nonexistent_sequence() {
let fasta = create_default_test_fasta().expect("creating test FASTA should succeed");
let reader =
ReferenceReader::new(fasta.path()).expect("creating reference reader should succeed");
let result = reader.fetch(
"chr999",
Position::try_from(1).expect("position conversion should succeed"),
Position::try_from(4).expect("position conversion should succeed"),
);
assert!(result.is_err());
}
#[test]
fn test_out_of_bounds() {
let fasta = create_default_test_fasta().expect("creating test FASTA should succeed");
let reader =
ReferenceReader::new(fasta.path()).expect("creating reference reader should succeed");
let result = reader.fetch(
"chr1",
Position::try_from(1).expect("position conversion should succeed"),
Position::try_from(100).expect("position conversion should succeed"),
);
assert!(result.is_err());
}
#[test]
fn test_reference_case_preserved() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let file = create_test_fasta(&[("chr1", "AcGtNnAaCcGgTt")])?;
let reader = ReferenceReader::new(file.path())?;
let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(14)?)?;
assert_eq!(seq, b"AcGtNnAaCcGgTt");
assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A'); assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'c'); assert_eq!(reader.base_at("chr1", Position::try_from(3)?)?, b'G'); assert_eq!(reader.base_at("chr1", Position::try_from(4)?)?, b't'); assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N'); assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'n');
Ok(())
}
#[test]
fn test_n_bases_at_various_positions() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let file = create_test_fasta(&[("chr1", "NACGTNACGTN")])?;
let reader = ReferenceReader::new(file.path())?;
assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'N');
assert_eq!(reader.base_at("chr1", Position::try_from(6)?)?, b'N');
assert_eq!(reader.base_at("chr1", Position::try_from(11)?)?, b'N');
let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(6)?)?;
assert_eq!(seq, b"NACGTN");
Ok(())
}
#[test]
fn test_all_n_sequence() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let file = create_test_fasta(&[("chrN", "NNNNNNNNNN")])?;
let reader = ReferenceReader::new(file.path())?;
let seq = reader.fetch("chrN", Position::try_from(1)?, Position::try_from(10)?)?;
assert_eq!(seq, b"NNNNNNNNNN");
for i in 1..=10 {
assert_eq!(reader.base_at("chrN", Position::try_from(i)?)?, b'N');
}
Ok(())
}
#[test]
fn test_long_sequence_boundaries() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let mut seq = String::new();
for _ in 0..7 {
seq.push_str("ACGT");
}
seq.push_str("NNNN");
for _ in 0..7 {
seq.push_str("ACGT");
}
seq.push_str("acgt");
for _ in 0..9 {
seq.push_str("ACGT");
}
assert_eq!(seq.len(), 100);
let file = create_test_fasta(&[("chr1", &seq)])?;
let reader = ReferenceReader::new(file.path())?;
assert_eq!(reader.base_at("chr1", Position::try_from(28)?)?, b'T');
assert_eq!(reader.base_at("chr1", Position::try_from(29)?)?, b'N');
assert_eq!(reader.base_at("chr1", Position::try_from(32)?)?, b'N');
assert_eq!(reader.base_at("chr1", Position::try_from(33)?)?, b'A');
assert_eq!(reader.base_at("chr1", Position::try_from(60)?)?, b'T');
assert_eq!(reader.base_at("chr1", Position::try_from(61)?)?, b'a');
assert_eq!(reader.base_at("chr1", Position::try_from(64)?)?, b't');
assert_eq!(reader.base_at("chr1", Position::try_from(65)?)?, b'A');
let cross_first = reader.fetch("chr1", Position::try_from(27)?, Position::try_from(34)?)?;
assert_eq!(cross_first, b"GTNNNNAC");
let cross_second =
reader.fetch("chr1", Position::try_from(59)?, Position::try_from(66)?)?;
assert_eq!(cross_second, b"GTacgtAC");
Ok(())
}
#[test]
fn test_single_base_sequence() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let file = create_test_fasta(&[("chr1", "A"), ("chr2", "N"), ("chr3", "g")])?;
let reader = ReferenceReader::new(file.path())?;
assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
assert_eq!(reader.base_at("chr2", Position::try_from(1)?)?, b'N');
assert_eq!(reader.base_at("chr3", Position::try_from(1)?)?, b'g');
let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
assert_eq!(seq, b"A");
Ok(())
}
#[test]
fn test_fetch_full_sequence() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let original = "ACGTNacgtn";
let file = create_test_fasta(&[("chr1", original)])?;
let reader = ReferenceReader::new(file.path())?;
let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(10)?)?;
assert_eq!(seq, original.as_bytes());
Ok(())
}
#[test]
fn test_fetch_last_base() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let file = create_test_fasta(&[("chr1", "ACGTN")])?;
let reader = ReferenceReader::new(file.path())?;
let seq = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(5)?)?;
assert_eq!(seq, b"N");
assert_eq!(reader.base_at("chr1", Position::try_from(5)?)?, b'N');
Ok(())
}
#[test]
fn test_multiple_chromosomes_isolation() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let file = create_test_fasta(&[
("chr1", "AAAA"),
("chr2", "CCCC"),
("chr3", "GGGG"),
("chr4", "TTTT"),
])?;
let reader = ReferenceReader::new(file.path())?;
assert_eq!(reader.fetch("chr1", Position::try_from(1)?, Position::try_from(4)?)?, b"AAAA");
assert_eq!(reader.fetch("chr2", Position::try_from(1)?, Position::try_from(4)?)?, b"CCCC");
assert_eq!(reader.fetch("chr3", Position::try_from(1)?, Position::try_from(4)?)?, b"GGGG");
assert_eq!(reader.fetch("chr4", Position::try_from(1)?, Position::try_from(4)?)?, b"TTTT");
Ok(())
}
#[test]
fn test_mixed_case_all_bases() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let file = create_test_fasta(&[("chr1", "ACGTNacgtn")])?;
let reader = ReferenceReader::new(file.path())?;
let expected = [b'A', b'C', b'G', b'T', b'N', b'a', b'c', b'g', b't', b'n'];
for (i, &expected_base) in expected.iter().enumerate() {
let pos = Position::try_from(i + 1)?;
assert_eq!(
reader.base_at("chr1", pos)?,
expected_base,
"Mismatch at position {}",
i + 1
);
}
Ok(())
}
#[test]
fn test_runs_of_n_bases() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let file = create_test_fasta(&[("chr1", "ACGTNNNNNNNNACGT")])?;
let reader = ReferenceReader::new(file.path())?;
let n_run = reader.fetch("chr1", Position::try_from(5)?, Position::try_from(12)?)?;
assert_eq!(n_run, b"NNNNNNNN");
let across = reader.fetch("chr1", Position::try_from(3)?, Position::try_from(14)?)?;
assert_eq!(across, b"GTNNNNNNNNAC");
Ok(())
}
#[test]
fn test_position_one_based() -> Result<()> {
use crate::sam::builder::create_test_fasta;
let file = create_test_fasta(&[("chr1", "ACGTN")])?;
let reader = ReferenceReader::new(file.path())?;
assert_eq!(reader.base_at("chr1", Position::try_from(1)?)?, b'A');
assert_eq!(reader.base_at("chr1", Position::try_from(2)?)?, b'C');
let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(1)?)?;
assert_eq!(seq, b"A");
let seq = reader.fetch("chr1", Position::try_from(1)?, Position::try_from(2)?)?;
assert_eq!(seq, b"AC");
Ok(())
}
#[test]
fn test_find_dict_path_replacing_convention() -> Result<()> {
let temp_dir = tempfile::tempdir()?;
let fasta_path = temp_dir.path().join("ref.fa");
let dict_path = temp_dir.path().join("ref.dict");
std::fs::write(&fasta_path, "")?;
std::fs::write(&dict_path, "")?;
let found = find_dict_path(&fasta_path);
assert!(found.is_some());
assert_eq!(found.expect("should find dictionary path"), dict_path);
Ok(())
}
#[test]
fn test_find_dict_path_appending_convention() -> Result<()> {
let temp_dir = tempfile::tempdir()?;
let fasta_path = temp_dir.path().join("ref.fa");
let dict_path = temp_dir.path().join("ref.fa.dict");
std::fs::write(&fasta_path, "")?;
std::fs::write(&dict_path, "")?;
let found = find_dict_path(&fasta_path);
assert!(found.is_some());
assert_eq!(found.expect("should find dictionary path"), dict_path);
Ok(())
}
#[test]
fn test_find_dict_path_prefers_replacing_convention() -> Result<()> {
let temp_dir = tempfile::tempdir()?;
let fasta_path = temp_dir.path().join("ref.fa");
let dict_replacing = temp_dir.path().join("ref.dict");
let dict_appending = temp_dir.path().join("ref.fa.dict");
std::fs::write(&fasta_path, "")?;
std::fs::write(&dict_replacing, "")?;
std::fs::write(&dict_appending, "")?;
let found = find_dict_path(&fasta_path);
assert!(found.is_some());
assert_eq!(found.expect("should find dictionary path"), dict_replacing);
Ok(())
}
#[test]
fn test_find_dict_path_not_found() {
let temp_dir = tempfile::tempdir().expect("creating temp file/dir should succeed");
let fasta_path = temp_dir.path().join("ref.fa");
std::fs::write(&fasta_path, "").expect("writing file should succeed");
let found = find_dict_path(&fasta_path);
assert!(found.is_none());
}
#[test]
fn test_find_dict_path_fasta_extension() -> Result<()> {
let temp_dir = tempfile::tempdir()?;
let fasta_path = temp_dir.path().join("ref.fasta");
let dict_path = temp_dir.path().join("ref.dict");
std::fs::write(&fasta_path, "")?;
std::fs::write(&dict_path, "")?;
let found = find_dict_path(&fasta_path);
assert!(found.is_some());
assert_eq!(found.expect("should find dictionary path"), dict_path);
Ok(())
}
#[test]
fn test_find_dict_path_fasta_appending_convention() -> Result<()> {
let temp_dir = tempfile::tempdir()?;
let fasta_path = temp_dir.path().join("ref.fasta");
let dict_path = temp_dir.path().join("ref.fasta.dict");
std::fs::write(&fasta_path, "")?;
std::fs::write(&dict_path, "")?;
let found = find_dict_path(&fasta_path);
assert!(found.is_some());
assert_eq!(found.expect("should find dictionary path"), dict_path);
Ok(())
}
}