use std::fs::File;
use std::io::{Seek, SeekFrom};
use std::path::Path;
use anyhow::{Context, Result, anyhow};
use noodles::core::Region;
use noodles::fasta;
use rand::Rng;
use crate::sequence_dict::SequenceDictionary;
pub struct Fasta {
reader: fasta::io::IndexedReader<fasta::io::BufReader<File>>,
dict: SequenceDictionary,
}
impl Fasta {
pub fn from_path(path: &Path) -> Result<Self> {
let reader = fasta::io::indexed_reader::Builder::default()
.build_from_path(path)
.with_context(|| format!("Failed to open indexed FASTA: {}", path.display()))?;
let dict = SequenceDictionary::from(reader.index());
Ok(Self { reader, dict })
}
#[must_use]
pub fn dict(&self) -> &SequenceDictionary {
&self.dict
}
pub fn load_contig(&mut self, contig_name: &str, rng: &mut impl Rng) -> Result<Vec<u8>> {
let meta = self
.dict
.get_by_name(contig_name)
.ok_or_else(|| anyhow!("Contig '{contig_name}' not found in reference"))?;
let expected_len = meta.length();
let region = Region::new(contig_name, ..);
let pos =
self.reader.index().query(®ion).with_context(|| {
format!("Failed to look up contig '{contig_name}' in FASTA index")
})?;
self.reader
.get_mut()
.seek(SeekFrom::Start(pos))
.with_context(|| format!("Failed to seek to contig '{contig_name}' in FASTA"))?;
let mut seq = Vec::with_capacity(expected_len);
self.reader
.read_sequence(&mut seq)
.with_context(|| format!("Failed to read contig '{contig_name}' from FASTA"))?;
normalize_bases(&mut seq, contig_name, rng)?;
Ok(seq)
}
#[must_use]
pub fn contig_names(&self) -> Vec<&str> {
self.dict.names()
}
}
fn resolve_iupac(code: u8, rng: &mut impl Rng) -> Option<u8> {
let alts: &[u8] = match code {
b'R' => b"AG",
b'Y' => b"CT",
b'S' => b"CG",
b'W' => b"AT",
b'K' => b"GT",
b'M' => b"AC",
b'B' => b"CGT",
b'D' => b"AGT",
b'H' => b"ACT",
b'V' => b"ACG",
b'N' => b"ACGT",
_ => return None,
};
Some(alts[rng.random_range(0..alts.len())])
}
fn normalize_bases(seq: &mut [u8], contig_name: &str, rng: &mut impl Rng) -> Result<()> {
for (i, b) in seq.iter_mut().enumerate() {
let upper = b.to_ascii_uppercase();
match upper {
b'A' | b'C' | b'G' | b'T' => *b = upper,
b'U' => *b = b'T',
b'R' | b'Y' | b'S' | b'W' | b'K' | b'M' | b'B' | b'D' | b'H' | b'V' | b'N' => {
let picked = resolve_iupac(upper, rng).expect("code is a valid IUPAC code");
*b = picked.to_ascii_lowercase();
}
_ => {
return Err(anyhow!(
"Contig '{contig_name}' contains unrecognized base {:?} (0x{:02X}) at position {i}",
char::from(*b),
*b
));
}
}
}
Ok(())
}
#[cfg(test)]
#[must_use]
pub fn write_test_fasta(dir: &std::path::Path, contigs: &[(&str, &[u8])]) -> std::path::PathBuf {
use std::io::Write;
let fasta_path = dir.join("ref.fa");
let fai_path = dir.join("ref.fa.fai");
let mut fa = std::fs::File::create(&fasta_path).unwrap();
let mut fai = std::fs::File::create(&fai_path).unwrap();
let mut offset: u64 = 0;
for &(name, seq) in contigs {
let header = format!(">{name}\n");
fa.write_all(header.as_bytes()).unwrap();
offset += header.len() as u64;
let seq_offset = offset;
let line_len = 80;
for chunk in seq.chunks(line_len) {
fa.write_all(chunk).unwrap();
fa.write_all(b"\n").unwrap();
}
let seq_len = seq.len();
let line_bases = line_len;
let line_width = line_len + 1; writeln!(fai, "{name}\t{seq_len}\t{seq_offset}\t{line_bases}\t{line_width}").unwrap();
let full_lines = seq_len / line_len;
let remainder = seq_len % line_len;
offset += (full_lines * line_width) as u64;
if remainder > 0 {
offset += (remainder + 1) as u64;
}
}
fasta_path
}
#[cfg(test)]
mod tests {
use super::*;
use rand::SeedableRng;
use rand::rngs::SmallRng;
fn test_rng() -> SmallRng {
SmallRng::seed_from_u64(42)
}
#[test]
fn test_load_contig_single() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(dir.path(), &[("chr1", b"ACGTACGT")]);
let mut fasta = Fasta::from_path(&path).unwrap();
assert_eq!(fasta.contig_names(), vec!["chr1"]);
assert_eq!(fasta.dict().len(), 1);
let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
assert_eq!(&seq, b"ACGTACGT");
}
#[test]
fn test_load_contig_multiple() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(
dir.path(),
&[("chr1", b"AAAA"), ("chr2", b"CCCCGGGG"), ("chr3", b"TT")],
);
let mut fasta = Fasta::from_path(&path).unwrap();
assert_eq!(fasta.contig_names(), vec!["chr1", "chr2", "chr3"]);
let seq1 = fasta.load_contig("chr1", &mut test_rng()).unwrap();
assert_eq!(&seq1, b"AAAA");
let seq2 = fasta.load_contig("chr2", &mut test_rng()).unwrap();
assert_eq!(&seq2, b"CCCCGGGG");
let seq3 = fasta.load_contig("chr3", &mut test_rng()).unwrap();
assert_eq!(&seq3, b"TT");
}
#[test]
fn test_load_contig_out_of_order() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(dir.path(), &[("chr1", b"AAAA"), ("chr2", b"CCCC")]);
let mut fasta = Fasta::from_path(&path).unwrap();
let seq2 = fasta.load_contig("chr2", &mut test_rng()).unwrap();
assert_eq!(&seq2, b"CCCC");
let seq1 = fasta.load_contig("chr1", &mut test_rng()).unwrap();
assert_eq!(&seq1, b"AAAA");
}
#[test]
fn test_load_contig_uppercases_acgt() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(dir.path(), &[("chr1", b"acgtACGT")]);
let mut fasta = Fasta::from_path(&path).unwrap();
let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
assert_eq!(&seq, b"ACGTACGT");
}
#[test]
fn test_load_contig_u_to_t() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(dir.path(), &[("chr1", b"AUGCUu")]);
let mut fasta = Fasta::from_path(&path).unwrap();
let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
assert_eq!(&seq, b"ATGCTT");
}
#[test]
fn test_load_contig_unknown() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(dir.path(), &[("chr1", b"ACGT")]);
let mut fasta = Fasta::from_path(&path).unwrap();
let result = fasta.load_contig("chrZ", &mut test_rng());
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("not found"));
}
#[test]
fn test_load_contig_rejects_unknown_byte() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(dir.path(), &[("chr1", b"ACZT")]);
let mut fasta = Fasta::from_path(&path).unwrap();
let err = fasta.load_contig("chr1", &mut test_rng()).unwrap_err();
let msg = err.to_string();
assert!(msg.contains("unrecognized base"), "message was: {msg}");
assert!(msg.contains("position 2"), "message was: {msg}");
}
#[test]
fn test_load_contig_iupac_resolved_to_lowercase() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(dir.path(), &[("chr1", b"ARYSWKMBDHVN")]);
let mut fasta = Fasta::from_path(&path).unwrap();
let seq = fasta.load_contig("chr1", &mut test_rng()).unwrap();
assert_eq!(seq.len(), 12);
assert_eq!(seq[0], b'A');
for (i, &b) in seq.iter().enumerate().skip(1) {
assert!(matches!(b, b'a' | b'c' | b'g' | b't'), "position {i} got {b:?}");
}
}
#[test]
fn test_load_contig_resolution_respects_ambiguity_set() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(
dir.path(),
&[("chr1", &[b'R'; 200]), ("chr2", &[b'Y'; 200]), ("chr3", &[b'K'; 200])],
);
let mut fasta = Fasta::from_path(&path).unwrap();
let r = fasta.load_contig("chr1", &mut test_rng()).unwrap();
let y = fasta.load_contig("chr2", &mut test_rng()).unwrap();
let k = fasta.load_contig("chr3", &mut test_rng()).unwrap();
for &b in &r {
assert!(matches!(b, b'a' | b'g'), "R resolved to {b:?}");
}
for &b in &y {
assert!(matches!(b, b'c' | b't'), "Y resolved to {b:?}");
}
for &b in &k {
assert!(matches!(b, b'g' | b't'), "K resolved to {b:?}");
}
}
#[test]
fn test_load_contig_resolution_reproducible() {
let dir = tempfile::tempdir().unwrap();
let path = write_test_fasta(dir.path(), &[("chr1", b"NNNNNNNNNNRRYYSSWWKKMMBBDDHHVV")]);
let mut fasta_a = Fasta::from_path(&path).unwrap();
let mut fasta_b = Fasta::from_path(&path).unwrap();
let a = fasta_a.load_contig("chr1", &mut SmallRng::seed_from_u64(1234)).unwrap();
let b = fasta_b.load_contig("chr1", &mut SmallRng::seed_from_u64(1234)).unwrap();
assert_eq!(a, b);
}
}