use std::{
error, fmt,
io::{self, ErrorKind},
str,
};
pub type Result<T> = std::result::Result<T, ParseError>;
#[derive(Debug)]
pub enum ParseError {
Io(io::Error),
TruncateFile(String),
InvalidFastx(String),
InvalidFasta(String),
InvalidFastq(String),
}
impl fmt::Display for ParseError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
ParseError::Io(err) => write!(f, "IO error: {}", err),
ParseError::TruncateFile(record) => {
write!(f, "Truncate file, problematic record: {}", record)
}
ParseError::InvalidFastx(record) => {
write!(f, "Not a valid fastx record: {}", record)
}
ParseError::InvalidFasta(record) => {
write!(f, "Not a valid fasta record: {}", record)
}
ParseError::InvalidFastq(record) => {
write!(f, "Not a valid fastq record: {}", record)
}
}
}
}
impl From<io::Error> for ParseError {
fn from(err: io::Error) -> ParseError {
ParseError::Io(err)
}
}
impl error::Error for ParseError {}
pub struct Fastx<'a> {
_head: usize,
_des: usize,
_seq: usize,
_sep: usize,
_qual: usize,
_data: &'a Vec<u8>,
}
impl Fastx<'_> {
#[inline]
pub fn head(&self) -> &str {
unsafe { str::from_utf8_unchecked(&self._data[1..self._head]) }
}
#[inline]
pub fn seq(&self) -> &str {
unsafe { str::from_utf8_unchecked(&self._data[self._des..self._seq]) }
}
#[inline]
pub fn des(&self) -> &str {
if self._head < self._des {
unsafe { str::from_utf8_unchecked(&self._data[self._head..self._des]) }
} else {
""
}
}
#[inline]
pub fn sep(&self) -> &str {
if self._seq < self._sep {
unsafe { str::from_utf8_unchecked(&self._data[self._seq..self._sep]) }
} else {
""
}
}
#[inline]
pub fn qual(&self) -> &str {
if self._sep < self._qual {
unsafe { str::from_utf8_unchecked(&self._data[self._sep..self._qual]) }
} else {
""
}
}
#[inline]
pub fn len(&self) -> usize {
self._seq - self._des
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn is_fasta(&self) -> bool {
(!self._data.is_empty()) && self._data[0] == b'>'
}
pub fn is_fastq(&self) -> bool {
(!self._data.is_empty()) && self._data[0] == b'@'
}
fn validate_fastq(&self) -> bool {
self.is_fastq() && !self.is_empty() && self._seq - self._des == self._qual - self._sep && self._head > 1
}
fn validate_fasta(&self) -> bool {
self.is_fasta() && !self.is_empty() && self._head > 1
}
}
pub struct Reader<'a> {
reader: Box<dyn io::BufRead + 'a>,
data: Vec<u8>,
}
impl<'a> Reader<'a> {
pub(crate) fn new(r: Box<dyn io::BufRead + 'a>) -> Self {
Reader {
reader: r,
data: Vec::with_capacity(1024),
}
}
fn has_data_left(&mut self) -> Result<bool> {
loop{
let available = self.reader.fill_buf().map_err(ParseError::Io)?;
if available.iter().any(|&x| !char::is_whitespace(x as char)){
return Ok(true);
}else if available.is_empty() {
return Ok(false);
}
let len = available.len();
self.reader.consume(len);
}
}
#[allow(dead_code)]
fn next_byte(&mut self) -> Result<Option<u8>> {
loop {
match self.reader.fill_buf() {
Ok(n) => return Ok(n.first().copied()),
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
Err(e) => return Err(ParseError::Io(e)),
};
}
}
fn read_line(&mut self, skip_blank_line: bool) -> Result<usize> {
let delim = b'\n';
loop {
let mut n = self.reader.read_until(delim, &mut self.data)?;
if n == 0 {
return Ok(n);
}
if self.data.last() == Some(&delim) {
self.data.pop();
n -= 1;
}
if n != 0 || !skip_blank_line {
return Ok(n);
}
}
}
fn read_until(&mut self, delim: u8) -> Result<usize> {
let mut read = 0;
loop {
let (done, used) = {
let available = match self.reader.fill_buf() {
Ok(n) => n,
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
Err(e) => return Err(ParseError::Io(e)),
};
let mut s = 0;
let mut mch = memchr::memchr2_iter(delim, b'\n', available);
loop {
match mch.next() {
Some(i) => {
self.data.extend_from_slice(&available[s..i]);
read += i - s;
s = i + 1;
if available[i] == delim {
break (true, i);
}
}
None => {
self.data.extend_from_slice(&available[s..]);
read += available.len() - s;
break (false, available.len());
}
}
}
};
self.reader.consume(used);
if done || used == 0 {
return Ok(read);
}
}
}
fn read_exact(&mut self, len: usize) -> Result<usize> {
let mut read = 0;
loop {
let (done, used) = {
let available = match self.reader.fill_buf() {
Ok(n) => n,
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
Err(e) => return Err(ParseError::Io(e)),
};
let mut s = 0;
let mut mch = memchr::memchr_iter(b'\n', available);
loop {
match mch.next() {
Some(i) => {
if read + i - s >= len {
let e = len - read + s;
self.data.extend_from_slice(&available[s..e]);
read = len;
break (true, e);
} else {
self.data.extend_from_slice(&available[s..i]);
read += i - s;
}
s = i + 1;
}
None => {
if available.len() - s + read >= len {
let e = len - read + s;
self.data.extend_from_slice(&available[s..e]);
read = len;
break (true, e);
} else {
self.data.extend_from_slice(&available[s..]);
read += available.len() - s;
break (false, available.len());
}
}
}
}
};
self.reader.consume(used);
if done || used == 0 {
return Ok(read);
}
}
}
pub fn iter_record(&mut self) -> Result<Option<Fastx>> {
self.data.clear();
let des = self.read_line(true)?;
if des == 0 {
return Ok(None);
} else if self.data[0] != b'>' && self.data[0] != b'@' {
return Err(ParseError::InvalidFastx(
String::from_utf8(self.data.to_owned()).unwrap(),
));
}
let head = self
.data
.iter()
.position(|&x| char::is_whitespace(x as char))
.map_or_else(|| des, |x| x);
let mut seq = des;
let mut sep = seq;
let mut qual = sep;
let is_fasta = self.data[0] == b'>';
if is_fasta {
seq += self.read_until(b'>')?;
} else {
seq += self.read_until(b'+')?;
sep = seq + self.read_line(true)?;
qual = sep + self.read_exact(seq - des)?;
}
if !self.has_data_left()? && (head == 1 || seq == des || (!is_fasta && (sep == seq || qual == sep))){
return Err(ParseError::TruncateFile(
String::from_utf8(self.data.to_owned()).unwrap(),
));
}
let fastx = Fastx {
_head: head,
_des: des,
_seq: seq,
_sep: sep,
_qual: qual,
_data: &self.data,
};
if is_fasta && !fastx.validate_fasta() {
return Err(ParseError::InvalidFasta(fastx.head().to_string()));
} else if !(is_fasta || fastx.validate_fastq()) {
return Err(ParseError::InvalidFastq(fastx.head().to_string()));
}
Ok(Some(fastx))
}
}
pub struct Readers<'a> {
index: usize,
pub(crate) readers: Vec<Reader<'a>>,
}
impl<'a> Default for Readers<'a> {
fn default() -> Self {
Self::new()
}
}
impl<'a> Readers<'a> {
pub(crate) fn new() -> Self {
Readers {
index: 0,
readers: Vec::new(),
}
}
pub(crate) fn iter_record(&mut self) -> Result<Option<Fastx>> {
for idx in self.index..self.readers.len() {
if self.readers[idx].has_data_left()? {
return self.readers[idx].iter_record();
}
self.index += 1;
}
Ok(None)
}
}