use std::collections::HashMap;
use std::fmt;
use std::path::Path;
use std::sync::Mutex;
use anyhow::{anyhow, bail, Context, Result};
use noodles_core::Position;
use noodles_fasta::io::indexed_reader::Builder;
use noodles_fasta::io::IndexedReader;
use crate::core::types::Region;
pub struct ReferenceGenome {
path: std::path::PathBuf,
reader: Mutex<IndexedReader<noodles_fasta::io::BufReader<std::fs::File>>>,
chrom_lengths: HashMap<String, u64>,
}
impl fmt::Debug for ReferenceGenome {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("ReferenceGenome")
.field("chrom_lengths", &self.chrom_lengths)
.finish_non_exhaustive()
}
}
impl ReferenceGenome {
pub fn open(path: &Path) -> Result<Self> {
if !path.exists() {
bail!("FASTA file not found: {}", path.display());
}
let index_path = {
let mut p = path.as_os_str().to_owned();
p.push(".fai");
std::path::PathBuf::from(p)
};
if !index_path.exists() {
bail!(
"FASTA index not found: {} (run `samtools faidx {}` to create it)",
index_path.display(),
path.display()
);
}
let reader = Builder::default()
.build_from_path(path)
.with_context(|| format!("failed to open indexed FASTA: {}", path.display()))?;
let chrom_lengths: HashMap<String, u64> = reader
.index()
.as_ref()
.iter()
.map(|rec| {
let name = String::from_utf8_lossy(rec.name()).into_owned();
(name, rec.length())
})
.collect();
Ok(Self {
path: path.to_path_buf(),
reader: Mutex::new(reader),
chrom_lengths,
})
}
pub fn sequence(&self, region: &Region) -> Result<Vec<u8>> {
let chrom_len = self
.chrom_lengths
.get(®ion.chrom)
.ok_or_else(|| anyhow!("chromosome '{}' not found in reference index", region.chrom))?;
if region.end > *chrom_len {
bail!(
"region {}:{}-{} is out of bounds (chromosome length = {})",
region.chrom,
region.start,
region.end,
chrom_len
);
}
if region.is_empty() {
return Ok(Vec::new());
}
let start1: usize = usize::try_from(region.start + 1).map_err(|e| {
anyhow!(
"start position {} is too large for this platform: {}",
region.start,
e
)
})?;
let end1: usize = usize::try_from(region.end).map_err(|e| {
anyhow!(
"end position {} is too large for this platform: {}",
region.end,
e
)
})?;
let noodles_start = Position::try_from(start1)
.map_err(|e| anyhow!("invalid start position {}: {}", region.start, e))?;
let noodles_end = Position::try_from(end1)
.map_err(|e| anyhow!("invalid end position {}: {}", region.end, e))?;
let noodles_region =
noodles_core::Region::new(region.chrom.as_bytes(), noodles_start..=noodles_end);
let record = self
.reader
.lock()
.map_err(|e| anyhow!("reference reader lock poisoned: {}", e))?
.query(&noodles_region)
.with_context(|| {
format!(
"failed to query region {}:{}-{}",
region.chrom, region.start, region.end
)
})?;
let seq: Vec<u8> = record
.sequence()
.as_ref()
.iter()
.map(|b| b.to_ascii_uppercase())
.collect();
Ok(seq)
}
pub fn chromosome_lengths(&self) -> &HashMap<String, u64> {
&self.chrom_lengths
}
pub fn chrom_len(&self, name: &str) -> Option<u64> {
self.chrom_lengths.get(name).copied()
}
#[cfg(test)]
pub fn contains_chromosome(&self, name: &str) -> bool {
self.chrom_lengths.contains_key(name)
}
#[cfg(test)]
pub fn genome_size(&self) -> u64 {
self.chrom_lengths.values().sum()
}
}
impl Clone for ReferenceGenome {
fn clone(&self) -> Self {
Self::open(&self.path).expect("failed to reopen reference genome for parallel worker")
}
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
fn write_test_fasta(dir: &TempDir) -> std::path::PathBuf {
let fa_path = dir.path().join("test.fa");
let fai_path = dir.path().join("test.fa.fai");
let fa_content = b">chr1\nACGTACGTACGTACGT\n>chr2\nNNNNacgtNNNN\n";
std::fs::write(&fa_path, fa_content).unwrap();
let fai_content = b"chr1\t16\t6\t16\t17\nchr2\t12\t29\t12\t13\n";
std::fs::write(&fai_path, fai_content).unwrap();
fa_path
}
#[test]
fn test_open_missing_file() {
let result = ReferenceGenome::open(Path::new("/nonexistent/path/genome.fa"));
assert!(result.is_err(), "expected error for missing FASTA file");
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("not found"),
"error message should mention 'not found': {msg}"
);
}
#[test]
fn test_open_missing_index() {
let dir = TempDir::new().unwrap();
let fa_path = dir.path().join("genome.fa");
std::fs::write(&fa_path, b">chr1\nACGT\n").unwrap();
let result = ReferenceGenome::open(&fa_path);
assert!(result.is_err(), "expected error when .fai is absent");
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("index not found") || msg.contains("not found"),
"error should mention missing index: {msg}"
);
}
#[test]
fn test_sequence_extraction() {
let dir = TempDir::new().unwrap();
let fa = write_test_fasta(&dir);
let genome = ReferenceGenome::open(&fa).unwrap();
let seq = genome.sequence(&Region::new("chr1", 0, 4)).unwrap();
assert_eq!(seq, b"ACGT");
let seq2 = genome.sequence(&Region::new("chr1", 4, 8)).unwrap();
assert_eq!(seq2, b"ACGT");
let seq3 = genome.sequence(&Region::new("chr1", 0, 16)).unwrap();
assert_eq!(seq3, b"ACGTACGTACGTACGT");
}
#[test]
fn test_sequence_uppercase() {
let dir = TempDir::new().unwrap();
let fa = write_test_fasta(&dir);
let genome = ReferenceGenome::open(&fa).unwrap();
let seq = genome.sequence(&Region::new("chr2", 4, 8)).unwrap();
assert_eq!(seq, b"ACGT", "lowercase bases must be uppercased");
}
#[test]
fn test_sequence_with_n_bases() {
let dir = TempDir::new().unwrap();
let fa = write_test_fasta(&dir);
let genome = ReferenceGenome::open(&fa).unwrap();
let seq = genome.sequence(&Region::new("chr2", 0, 4)).unwrap();
assert_eq!(seq, b"NNNN");
}
#[test]
fn test_chromosome_lengths() {
let dir = TempDir::new().unwrap();
let fa = write_test_fasta(&dir);
let genome = ReferenceGenome::open(&fa).unwrap();
let lengths = genome.chromosome_lengths();
assert_eq!(lengths.get("chr1"), Some(&16u64));
assert_eq!(lengths.get("chr2"), Some(&12u64));
}
#[test]
fn test_out_of_bounds() {
let dir = TempDir::new().unwrap();
let fa = write_test_fasta(&dir);
let genome = ReferenceGenome::open(&fa).unwrap();
let result = genome.sequence(&Region::new("chr1", 0, 17));
assert!(result.is_err(), "expected out-of-bounds error");
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("out of bounds"),
"error should mention out-of-bounds: {msg}"
);
}
#[test]
fn test_contains_chromosome() {
let dir = TempDir::new().unwrap();
let fa = write_test_fasta(&dir);
let genome = ReferenceGenome::open(&fa).unwrap();
assert!(genome.contains_chromosome("chr1"));
assert!(genome.contains_chromosome("chr2"));
assert!(!genome.contains_chromosome("chr3"));
assert!(!genome.contains_chromosome("1")); }
#[test]
fn test_genome_size() {
let dir = TempDir::new().unwrap();
let fa = write_test_fasta(&dir);
let genome = ReferenceGenome::open(&fa).unwrap();
assert_eq!(genome.genome_size(), 28);
}
}