use std::fmt::{self, Write};
use std::fs::File;
use std::io::{self, BufRead, BufReader, Read};
use std::path::Path;
use indexmap::IndexSet;
use memmap2::{Mmap, MmapOptions};
#[derive(Debug, Clone)]
pub struct Fai {
chromosomes: Vec<FaiRecord>,
name_map: IndexSet<String>,
}
impl Fai {
pub fn from_file<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let f = File::open(path)?;
let br = BufReader::new(f);
let mut name_map = IndexSet::new();
let mut chromosomes = Vec::new();
for l in br.lines() {
let line = l?;
let p: Vec<_> = line.split('\t').collect();
if p.len() != 5 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Expected 5 columns in .fai file.",
));
}
name_map.insert(p[0].to_owned());
let ioerr =
|e, msg| io::Error::new(io::ErrorKind::InvalidData, format!("{}:{}", msg, e));
chromosomes.push(FaiRecord {
len: p[1]
.parse()
.map_err(|e| ioerr(e, "Error parsing chr len in .fai"))?,
offset: p[2]
.parse()
.map_err(|e| ioerr(e, "Error parsing chr offset in .fai"))?,
line_bases: p[3]
.parse()
.map_err(|e| ioerr(e, "Error parsing chr line_bases in .fai"))?,
line_width: p[4]
.parse()
.map_err(|e| ioerr(e, "Error parsing chr line_width in .fai"))?,
});
}
Ok(Fai {
chromosomes,
name_map,
})
}
#[inline]
pub fn offset(&self, tid: usize, start: usize, stop: usize) -> io::Result<(usize, usize)> {
let chr = &self.chromosomes.get(tid).ok_or_else(|| {
io::Error::other("Chromomsome tid was out of bounds")
})?;
if stop > chr.len {
return Err(io::Error::other("FASTA read interval was out of bounds"));
}
let start_offset =
chr.offset + (start / chr.line_bases) * chr.line_width + start % chr.line_bases;
let stop_offset =
chr.offset + (stop / chr.line_bases) * chr.line_width + stop % chr.line_bases;
Ok((start_offset, stop_offset))
}
#[inline]
pub fn offset_tid(&self, tid: usize) -> io::Result<(usize, usize)> {
let chr = &self.chromosomes.get(tid).ok_or_else(|| {
io::Error::other("Chromomsome tid was out of bounds")
})?;
let start_offset = chr.offset;
let stop_offset =
chr.offset + (chr.len / chr.line_bases) * chr.line_width + chr.len % chr.line_bases;
Ok((start_offset, stop_offset))
}
#[inline]
pub fn tid(&self, name: &str) -> Option<usize> {
self.name_map.get_index_of(name)
}
pub fn size(&self, tid: usize) -> io::Result<usize> {
let chr = &self.chromosomes.get(tid).ok_or_else(|| {
io::Error::other("Chromomsome tid was out of bounds")
})?;
Ok(chr.len)
}
pub fn name(&self, tid: usize) -> io::Result<&String> {
self.name_map.get_index(tid).ok_or_else(|| {
io::Error::other("Chromomsome tid was out of bounds")
})
}
pub fn names(&self) -> Vec<&str> {
self.name_map.iter().map(|s| s.as_str()).collect()
}
}
#[derive(Debug, Clone)]
pub struct FaiRecord {
len: usize,
offset: usize,
line_bases: usize,
line_width: usize,
}
pub struct IndexedFasta {
mmap: Mmap,
fasta_index: Fai,
}
impl IndexedFasta {
pub fn from_file<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mut fai_path = path.as_ref().as_os_str().to_owned();
fai_path.push(".fai");
let fasta_index = Fai::from_file(&fai_path)?;
let file = File::open(path)?;
let mmap = unsafe { MmapOptions::new().map(&file)? };
Ok(IndexedFasta { mmap, fasta_index })
}
pub fn view(&self, tid: usize, start: usize, stop: usize) -> io::Result<FastaView<'_>> {
if start > stop {
return Err(io::Error::other("Invalid query interval"));
}
let (start_byte, stop_byte) = self.fasta_index.offset(tid, start, stop)?;
Ok(FastaView(&self.mmap[start_byte..stop_byte]))
}
pub fn view_tid(&self, tid: usize) -> io::Result<FastaView<'_>> {
let (start_byte, stop_byte) = self.fasta_index.offset_tid(tid)?;
Ok(FastaView(&self.mmap[start_byte..stop_byte]))
}
pub fn fai(&self) -> &Fai {
&self.fasta_index
}
}
pub struct FastaView<'a>(&'a [u8]);
impl<'a> FastaView<'a> {
pub fn count_bases(&self) -> BaseCounts {
let mut bc: BaseCounts = Default::default();
for b in self.bases() {
let v: u8 = b << 3;
if v ^ 8 == 0 {
bc.a += 1;
} else if v ^ 24 == 0 {
bc.c += 1;
} else if v ^ 56 == 0 {
bc.g += 1;
} else if v ^ 112 == 0 {
bc.n += 1;
} else if v ^ 160 == 0 {
bc.t += 1;
} else {
bc.other += 1;
}
}
bc
}
pub fn bases(&self) -> impl Iterator<Item = &'a u8> {
self.0.iter().filter(|&&b| b & 192 == 64)
}
}
impl<'a> std::fmt::Display for FastaView<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.bases().try_for_each(|&c| f.write_char(c as char))
}
}
impl<'a> Read for FastaView<'a> {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let mut read = 0;
let mut skipped = 0;
for (t, s) in buf.iter_mut().zip(self.0.iter().filter(|&&c| {
let base = c & 192 == 64;
if !base {
skipped += 1;
}
base
})) {
*t = *s;
read += 1;
}
self.0 = &self.0[(skipped + read)..];
Ok(read)
}
}
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct BaseCounts {
pub a: usize,
pub c: usize,
pub g: usize,
pub t: usize,
pub n: usize,
pub other: usize,
}
impl Default for BaseCounts {
fn default() -> BaseCounts {
BaseCounts {
a: 0,
c: 0,
g: 0,
t: 0,
n: 0,
other: 0,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn fai() {
let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
assert_eq!(ir.fai().names().len(), 3);
assert_eq!(ir.fai().tid("ACGT-25"), Some(2));
assert_eq!(ir.fai().tid("NotFound"), None);
assert_eq!(ir.fai().size(2).unwrap(), 100);
assert_eq!(ir.fai().name(2).unwrap(), "ACGT-25");
assert!(ir.fai().name(3).is_err());
}
#[test]
fn view() {
let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
assert_eq!(ir.view(0, 0, 10).unwrap().to_string(), "AAAAAAAAAA");
assert!(ir.view(0, 0, 11).is_err());
assert_eq!(
ir.view(2, 38, 62).unwrap().to_string(),
"CCCCCCCCCCCCGGGGGGGGGGGG"
);
assert_eq!(
ir.view(2, 74, 100).unwrap().to_string(),
"GTTTTTTTTTTTTTTTTTTTTTTTTT"
);
assert!(ir.view(0, 120, 130).is_err());
}
#[test]
fn view_tid() {
let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
assert_eq!(ir.view_tid(0).unwrap().to_string(), "AAAAAAAAAA");
assert_eq!(ir.view_tid(1).unwrap().to_string(),
"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
assert_eq!(ir.view_tid(2).unwrap().to_string(),
"AAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTT");
assert!(ir.view_tid(3).is_err());
}
#[test]
fn view_bases() {
let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
let v = ir.view(2, 48, 52).unwrap();
let mut b = v.bases();
assert_eq!(b.next(), Some(&b'C'));
assert_eq!(b.next(), Some(&b'C'));
assert_eq!(b.next(), Some(&b'G'));
assert_eq!(b.next(), Some(&b'G'));
assert_eq!(b.next(), None);
}
#[test]
fn view_counts() {
let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
assert_eq!(
ir.view(2, 48, 52).unwrap().count_bases(),
BaseCounts {
c: 2,
g: 2,
..Default::default()
}
);
}
#[test]
fn read_view() {
let ir = IndexedFasta::from_file("test/genome.fa").unwrap();
let mut buf = vec![0; 25];
let mut v = ir.view_tid(2).unwrap();
println!("{}", v.to_string());
assert_eq!(v.read(&mut buf).unwrap(), 25);
assert_eq!(buf, vec![b'A'; 25]);
assert_eq!(v.read(&mut buf).unwrap(), 25);
assert_eq!(buf, vec![b'C'; 25]);
assert_eq!(v.read(&mut buf).unwrap(), 25);
assert_eq!(buf, vec![b'G'; 25]);
let mut buf2 = vec![0; 10];
assert_eq!(v.read(&mut buf2).unwrap(), 10);
assert_eq!(buf2, vec![b'T'; 10]);
assert_eq!(v.read(&mut buf2).unwrap(), 10);
assert_eq!(buf2, vec![b'T'; 10]);
assert_eq!(v.read(&mut buf2).unwrap(), 5);
assert_eq!(&buf2[0..5], vec![b'T'; 5].as_slice());
}
}