use std::collections::HashMap;
use std::convert::TryFrom;
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom};
use std::path::Path;
use std::string::ToString;
use crate::models::Sequence;
use crate::utils::errors::FastaError;
type FastaResult<T> = Result<T, FastaError>;
struct ChromosomeIndex {
name: String,
bases: u64,
start: u64,
line_bases: u64,
line_bytes: u64,
}
impl ChromosomeIndex {
pub fn new(line: &str) -> FastaResult<Self> {
let cols: Vec<&str> = line.split('\t').collect();
if cols.len() != 5 {
return Err(FastaError::new(format!(
"expected 5 columns but received {}",
cols.len()
)));
}
Ok(Self {
name: cols[0].to_string(),
bases: cols[1].parse::<u64>()?,
start: cols[2].parse::<u64>()?,
line_bases: cols[3].parse::<u64>()?,
line_bytes: cols[4].parse::<u64>()?,
})
}
pub fn name(&self) -> &str {
&self.name
}
pub fn offset(&self, pos: u64) -> FastaResult<u64> {
if pos > self.bases {
return Err(FastaError::new(format!(
"position {} is greater than chromome length {}",
pos, self.bases
)));
}
let line_offset = (pos - 1) % self.line_bases;
let line_start = (pos - 1) / self.line_bases * self.line_bytes;
Ok(self.start + line_start + line_offset)
}
}
struct FastaIndex {
chromosomes: HashMap<String, ChromosomeIndex>,
}
impl FastaIndex {
pub fn from_reader<R: std::io::Read>(mut reader: R) -> FastaResult<Self> {
let mut content = String::new();
reader.read_to_string(&mut content)?;
Self::from_str(&content)
}
fn from_str(content: &str) -> FastaResult<Self> {
let mut idx = Self {
chromosomes: HashMap::new(),
};
for line in content.lines() {
let chrom = ChromosomeIndex::new(line)?;
idx.chromosomes.insert(chrom.name().to_string(), chrom);
}
Ok(idx)
}
pub fn offset(&self, chrom: &str, pos: u64) -> FastaResult<u64> {
match self.chromosomes.get(chrom) {
Some(idx) => Ok(idx.offset(pos)?),
None => Err(FastaError::new(format!(
"index for {} does not exist",
chrom
))),
}
}
pub fn offset_range(&self, chrom: &str, start: u64, end: u64) -> FastaResult<(u64, u64)> {
let offset_start = self.offset(chrom, start)?;
let offset_end = self.offset(chrom, end)? + 1;
Ok((offset_start, offset_end))
}
}
pub struct FastaReader<R> {
inner: std::io::BufReader<R>,
idx: FastaIndex,
}
impl FastaReader<File> {
pub fn from_file<P: AsRef<Path> + std::fmt::Display>(path: P) -> FastaResult<Self> {
let fai_path = format!("{}.fai", path);
FastaReader::new(path, fai_path)
}
pub fn new<P: AsRef<Path> + std::fmt::Display, P2: AsRef<Path> + std::fmt::Display>(
fasta_path: P,
fai_path: P2,
) -> FastaResult<Self> {
let fasta_reader = match File::open(fasta_path.as_ref()) {
Ok(x) => x,
Err(err) => {
return Err(FastaError::new(format!(
"unable to open fasta file {}: {}",
fasta_path, err
)))
}
};
let fai_reader = match File::open(fai_path.as_ref()) {
Ok(x) => x,
Err(err) => {
return Err(FastaError::new(format!(
"unable to open fasta index file {}: {}",
fai_path, err
)))
}
};
FastaReader::from_reader(fasta_reader, fai_reader)
}
}
impl<R: std::io::Read> FastaReader<R> {
pub fn from_reader<R2: std::io::Read>(
fasta_reader: R,
fai_reader: R2,
) -> FastaResult<FastaReader<R>> {
Ok(FastaReader {
inner: BufReader::new(fasta_reader),
idx: FastaIndex::from_reader(fai_reader)?,
})
}
}
impl<R: std::io::Read + std::io::Seek> FastaReader<R> {
pub fn read_range(&mut self, chrom: &str, start: u64, end: u64) -> FastaResult<Vec<u8>> {
let (byte_start, byte_end) = self.idx.offset_range(chrom, start, end)?;
self.inner.seek(SeekFrom::Start(byte_start))?;
let capacity = usize::try_from(byte_end - byte_start)?;
let mut buffer: Vec<u8> = vec![0; capacity];
self.inner.read_exact(&mut buffer)?;
Ok(buffer)
}
pub fn read_sequence(&mut self, chrom: &str, start: u64, end: u64) -> FastaResult<Sequence> {
let raw_bytes = self.read_range(chrom, start, end)?;
let length = usize::try_from(end - start)?;
Ok(Sequence::from_raw_bytes(&raw_bytes, length)?)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fai_reading() {
let fai =
FastaIndex::from_reader(File::open("tests/data/small.fasta.fai").unwrap()).unwrap();
assert_eq!(fai.offset("chr1", 1).unwrap(), 6);
assert_eq!(fai.offset("chr1", 50).unwrap(), 55);
assert_eq!(fai.offset("chr1", 51).unwrap(), 57);
assert_eq!(fai.offset("chr1", 100).unwrap(), 106);
assert_eq!(fai.offset("chr1", 101).unwrap(), 108);
assert_eq!(fai.offset("chr1", 150).unwrap(), 157);
assert_eq!(fai.offset("chr1", 151).unwrap(), 159);
assert_eq!(fai.offset("chr2", 1).unwrap(), 218);
}
#[test]
fn test_fai_errors() {
let fai =
FastaIndex::from_reader(File::open("tests/data/small.fasta.fai").unwrap()).unwrap();
assert_eq!(
fai.offset("chr6", 1).unwrap_err().to_string(),
"index for chr6 does not exist".to_string()
);
assert_eq!(
fai.offset("chr1", 202).unwrap_err().to_string(),
"position 202 is greater than chromome length 201".to_string()
);
assert_eq!(
fai.offset("chr2", 235).unwrap_err().to_string(),
"position 235 is greater than chromome length 234".to_string()
);
assert_eq!(
fai.offset("chr3", 193).unwrap_err().to_string(),
"position 193 is greater than chromome length 192".to_string()
);
assert_eq!(
fai.offset("chr4", 150).unwrap_err().to_string(),
"position 150 is greater than chromome length 149".to_string()
);
assert_eq!(
fai.offset("chrM", 151).unwrap_err().to_string(),
"position 151 is greater than chromome length 150".to_string()
);
}
#[test]
fn test_fasta_reading() {
let mut fasta = FastaReader::from_file("tests/data/small.fasta").unwrap();
let seq = fasta.read_sequence("chr1", 1, 10).unwrap();
assert_eq!(&seq.to_string(), "GCCTCAGAGG");
let seq = fasta.read_sequence("chr1", 51, 53).unwrap();
assert_eq!(&seq.to_string(), "AGG");
let seq = fasta.read_sequence("chr2", 55, 60).unwrap();
assert_eq!(&seq.to_string(), "TCTCAT");
let seq = fasta.read_sequence("chr4", 75, 110).unwrap();
assert_eq!(&seq.to_string(), "GCACACCTCCTGCTTCTAACAGCAGAGCTGCCAGGC");
let seq = fasta.read_sequence("chr1", 201, 201).unwrap();
assert_eq!(&seq.to_string(), "G");
let seq = fasta.read_sequence("chr1", 198, 201).unwrap();
assert_eq!(&seq.to_string(), "GATG");
let seq = fasta.read_sequence("chr4", 148, 149).unwrap();
assert_eq!(&seq.to_string(), "TA");
let seq = fasta.read_sequence("chrM", 101, 150).unwrap();
assert_eq!(
&seq.to_string(),
"TGACCTGCAGGGTCGAGGAGTTGACGGTGCTGAGTTCCCTGCACTCTCAG"
);
let seq = fasta.read_sequence("chrM", 150, 150).unwrap();
assert_eq!(&seq.to_string(), "G");
let seq = fasta.read_sequence("chrM", 1, 150).unwrap();
assert_eq!(seq.len(), 150);
}
}