use std::io::prelude::*;
use bio::proteins::{AverageMass, ProteinMass};
use traits::*;
use util::*;
use super::evidence::ProteinEvidence;
use super::re::*;
use super::record::Record;
use super::record_list::RecordList;
/// Iterator over the raw text of individual FASTA entries read from `reader`.
///
/// Each call to `next` delegates to `text_next_skip_whitespace` with `">"`
/// as the record marker; the two buffers are scratch space reused across calls.
pub struct FastaIter<T: BufRead> {
/// Buffered source the entries are read from.
reader: T,
/// Scratch buffer handed to `text_next_skip_whitespace` on every call.
buf: BufferType,
/// Scratch line buffer handed to `text_next_skip_whitespace` on every call.
line: String,
}
impl<T: BufRead> FastaIter<T> {
    /// Create a new iterator over the FASTA entries found in `reader`.
    ///
    /// Both scratch buffers are pre-allocated with the same initial capacity
    /// so that typical entries do not trigger a reallocation.
    #[inline]
    pub fn new(reader: T) -> Self {
        const INITIAL_CAPACITY: usize = 8000;

        let buf = Vec::with_capacity(INITIAL_CAPACITY);
        let line = String::with_capacity(INITIAL_CAPACITY);
        FastaIter { reader, buf, line }
    }
}
/// Yields each FASTA entry as an owned `String`, or the error raised while
/// reading it.
impl<T: BufRead> Iterator for FastaIter<T> {
type Item = ResultType<String>;
// All splitting logic lives in `text_next_skip_whitespace`; this only
// supplies the `">"` marker and the reusable reader/buffers.
fn next(&mut self) -> Option<Self::Item> {
text_next_skip_whitespace(">", &mut self.reader, &mut self.buf, &mut self.line)
}
}
/// Estimate the number of bytes needed to serialize `record` as FASTA.
///
/// The estimate is a fixed allowance for the header vocabulary plus the
/// length of every variable-length field written to the output.
#[inline]
fn estimate_record_size(record: &Record) -> usize {
    // Fixed allowance for tags, separators and the numeric fields.
    const FASTA_VOCABULARY_SIZE: usize = 40;

    let field_lengths = [
        record.gene.len(),
        record.id.len(),
        record.mnemonic.len(),
        record.name.len(),
        record.organism.len(),
        record.taxonomy.len(),
        record.sequence.len(),
    ];
    field_lengths
        .iter()
        .fold(FASTA_VOCABULARY_SIZE, |total, length| total + *length)
}
/// Estimate the number of bytes needed to serialize every record in `list`.
///
/// This is the sum of `estimate_record_size` over all records.
#[inline]
fn estimate_list_size(list: &RecordList) -> usize {
    list.iter()
        .map(|record| estimate_record_size(record))
        .sum()
}
/// Write a UniProt FASTA header line for `record`, starting with `prefix`.
///
/// Shared implementation behind `write_swissprot_header` and
/// `write_trembl_header`, whose output differs only in the leading database
/// tag. The layout is:
///
/// `<prefix><id>|<mnemonic> <name> OS=<organism>[ OX=<taxonomy>][ GN=<gene>] PE=<n> SV=<n>`
///
/// The `OX=` and `GN=` fields are omitted when the corresponding record field
/// is empty. No trailing newline is written.
fn write_uniprot_header<T: Write>(prefix: &[u8], record: &Record, writer: &mut T)
    -> ResultType<()>
{
    write_alls!(
        writer,
        prefix, record.id.as_bytes(),
        b"|", record.mnemonic.as_bytes(),
        b" ", record.name.as_bytes(),
        b" OS=", record.organism.as_bytes()
    )?;

    // Optional fields: only emitted when the record carries a value.
    if !record.taxonomy.is_empty() {
        write_alls!(writer, b" OX=", record.taxonomy.as_bytes())?;
    }
    if !record.gene.is_empty() {
        write_alls!(writer, b" GN=", record.gene.as_bytes())?;
    }

    write_alls!(
        writer,
        b" PE=", record.protein_evidence.ntoa()?.as_bytes(),
        b" SV=", record.sequence_version.ntoa()?.as_bytes()
    )?;

    Ok(())
}

/// Write the Swiss-Prot (`>sp|`) FASTA header for `record` to `writer`.
///
/// Returns an error if a numeric field cannot be formatted or if writing
/// to `writer` fails.
pub fn write_swissprot_header<T: Write>(record: &Record, writer: &mut T)
    -> ResultType<()>
{
    write_uniprot_header(b">sp|", record, writer)
}

/// Write the TrEMBL (`>tr|`) FASTA header for `record` to `writer`.
///
/// Returns an error if a numeric field cannot be formatted or if writing
/// to `writer` fails.
pub fn write_trembl_header<T: Write>(record: &Record, writer: &mut T)
    -> ResultType<()>
{
    write_uniprot_header(b">tr|", record, writer)
}
/// Adapter with the `(writer, record)` argument order handed to
/// `TextWriterState::export`; simply forwards to `record_to_fasta`.
#[inline(always)]
fn to_fasta<'a, T: Write>(writer: &mut T, record: &'a Record) -> ResultType<()> {
record_to_fasta(writer, record)
}
/// Serialize `record` to `writer` in UniProt FASTA format.
///
/// Reviewed records get a Swiss-Prot header, all others a TrEMBL header.
/// The sequence follows, wrapped at 60 residues per line; every sequence
/// line is preceded (not followed) by a newline, so the output never ends
/// with a trailing newline and an empty sequence produces the header only.
pub fn record_to_fasta<T: Write>(writer: &mut T, record: &Record)
    -> ResultType<()>
{
    const SEQUENCE_LINE_LENGTH: usize = 60;

    if !record.reviewed {
        write_trembl_header(record, writer)?;
    } else {
        write_swissprot_header(record, writer)?;
    }

    // `chunks` yields full-width lines followed by the (possibly shorter)
    // final line, and yields nothing at all for an empty sequence.
    for sequence_line in record.sequence.chunks(SEQUENCE_LINE_LENGTH) {
        writer.write_all(b"\n")?;
        writer.write_all(sequence_line)?;
    }

    Ok(())
}
/// Initialization callback for the generic export helpers: wraps `writer`
/// in a `TextWriterState` configured with `delimiter`. Never fails.
#[inline(always)]
fn init_cb<T: Write>(writer: &mut T, delimiter: u8)
-> ResultType<TextWriterState<T>>
{
Ok(TextWriterState::new(writer, delimiter))
}
/// Per-record callback for the generic export helpers: asks the writer
/// state to export `record`, using `to_fasta` as the serializer.
#[inline(always)]
fn export_cb<'a, T: Write>(writer: &mut TextWriterState<T>, record: &'a Record)
-> ResultType<()>
{
writer.export(record, &to_fasta)
}
/// Teardown callback for the generic export helpers. FASTA output has no
/// footer, so this is a no-op that always succeeds.
#[inline(always)]
fn dest_cb<T: Write>(_: &mut TextWriterState<T>)
-> ResultType<()>
{
Ok(())
}
/// Write every record yielded by `iter` (by reference) to `writer` as FASTA.
///
/// Delegates to `reference_iterator_export` with a newline (`b'\n'`)
/// delimiter and the FASTA init/export/teardown callbacks.
#[inline(always)]
pub fn reference_iterator_to_fasta<'a, Iter, T>(writer: &mut T, iter: Iter)
-> ResultType<()>
where T: Write,
Iter: Iterator<Item = &'a Record>
{
reference_iterator_export(writer, iter, b'\n', &init_cb, &export_cb, &dest_cb)
}
/// Write every record yielded by `iter` (by value, as `ResultType<Record>`)
/// to `writer` as FASTA.
///
/// Delegates to `value_iterator_export` with a newline (`b'\n'`) delimiter
/// and the FASTA init/export/teardown callbacks.
#[inline(always)]
pub fn value_iterator_to_fasta<Iter, T>(writer: &mut T, iter: Iter)
-> ResultType<()>
where T: Write,
Iter: Iterator<Item = ResultType<Record>>
{
value_iterator_export(writer, iter, b'\n', &init_cb, &export_cb, &dest_cb)
}
/// Strict variant of `reference_iterator_to_fasta`.
///
/// Delegates to `reference_iterator_export_strict` with a newline delimiter.
/// In this file's tests, an input containing an empty `Record::new()` makes
/// the strict export return an error.
#[inline(always)]
pub fn reference_iterator_to_fasta_strict<'a, Iter, T>(writer: &mut T, iter: Iter)
-> ResultType<()>
where T: Write,
Iter: Iterator<Item = &'a Record>
{
reference_iterator_export_strict(writer, iter, b'\n', &init_cb, &export_cb, &dest_cb)
}
/// Strict variant of `value_iterator_to_fasta`.
///
/// Delegates to `value_iterator_export_strict` with a newline delimiter.
/// In this file's tests, an input containing an empty `Record::new()` makes
/// the strict export return an error.
#[inline(always)]
pub fn value_iterator_to_fasta_strict<Iter, T>(writer: &mut T, iter: Iter)
-> ResultType<()>
where T: Write,
Iter: Iterator<Item = ResultType<Record>>
{
value_iterator_export_strict(writer, iter, b'\n', &init_cb, &export_cb, &dest_cb)
}
/// Lenient variant of `reference_iterator_to_fasta`.
///
/// Delegates to `reference_iterator_export_lenient` with a newline delimiter.
/// In this file's tests, an empty `Record::new()` in the input is left out
/// of the output instead of causing an error.
#[inline(always)]
pub fn reference_iterator_to_fasta_lenient<'a, Iter, T>(writer: &mut T, iter: Iter)
-> ResultType<()>
where T: Write,
Iter: Iterator<Item = &'a Record>
{
reference_iterator_export_lenient(writer, iter, b'\n', &init_cb, &export_cb, &dest_cb)
}
/// Lenient variant of `value_iterator_to_fasta`.
///
/// Delegates to `value_iterator_export_lenient` with a newline delimiter.
/// In this file's tests, an empty `Record::new()` in the input is left out
/// of the output instead of causing an error.
#[inline(always)]
pub fn value_iterator_to_fasta_lenient<Iter, T>(writer: &mut T, iter: Iter)
-> ResultType<()>
where T: Write,
Iter: Iterator<Item = ResultType<Record>>
{
value_iterator_export_lenient(writer, iter, b'\n', &init_cb, &export_cb, &dest_cb)
}
/// Parse a Swiss-Prot (`>sp|...`) FASTA header line into a `Record`.
///
/// Only the header fields are populated; `sequence` is left empty and
/// `mass`/`length` are zero, to be filled in by the caller once the sequence
/// has been read. `reviewed` is always `true`.
///
/// # Errors
///
/// Returns `InvalidInput` if the header does not match the Swiss-Prot header
/// regex or if the sequence version does not fit the record's numeric type,
/// and propagates any error from parsing the protein-evidence field.
fn record_header_from_swissprot(header: &str) -> ResultType<Record> {
    type R = SwissProtHeaderRegex;

    let captures = none_to_error!(R::extract().captures(header), InvalidInput);
    let pe = capture_as_str(&captures, R::PE_INDEX);
    let sv = capture_as_str(&captures, R::SV_INDEX);

    // The header is external input: a matched-but-unparsable value (e.g. an
    // overly long digit run) must surface as an error, not a panic.
    let sequence_version = none_to_error!(sv.parse().ok(), InvalidInput);

    Ok(Record {
        sequence_version: sequence_version,
        protein_evidence: ProteinEvidence::from_str(pe)?,
        mass: 0,
        length: 0,
        gene: optional_capture_as_string(&captures, R::GENE_INDEX),
        id: capture_as_string(&captures, R::ACCESSION_INDEX),
        mnemonic: capture_as_string(&captures, R::MNEMONIC_INDEX),
        name: capture_as_string(&captures, R::NAME_INDEX),
        organism: capture_as_string(&captures, R::ORGANISM_INDEX),
        taxonomy: optional_capture_as_string(&captures, R::TAXONOMY_INDEX),
        reviewed: true,
        proteome: String::new(),
        sequence: vec![],
    })
}
/// Parse a TrEMBL (`>tr|...`) FASTA header line into a `Record`.
///
/// Only the header fields are populated; `sequence` is left empty and
/// `mass`/`length` are zero, to be filled in by the caller once the sequence
/// has been read. `reviewed` is always `false`.
///
/// # Errors
///
/// Returns `InvalidInput` if the header does not match the TrEMBL header
/// regex or if the sequence version does not fit the record's numeric type,
/// and propagates any error from parsing the protein-evidence field.
fn record_header_from_trembl(header: &str) -> ResultType<Record> {
    type R = TrEMBLHeaderRegex;

    let captures = none_to_error!(R::extract().captures(header), InvalidInput);
    let pe = capture_as_str(&captures, R::PE_INDEX);
    let sv = capture_as_str(&captures, R::SV_INDEX);

    // The header is external input: a matched-but-unparsable value (e.g. an
    // overly long digit run) must surface as an error, not a panic.
    let sequence_version = none_to_error!(sv.parse().ok(), InvalidInput);

    Ok(Record {
        sequence_version: sequence_version,
        protein_evidence: ProteinEvidence::from_str(pe)?,
        mass: 0,
        length: 0,
        gene: optional_capture_as_string(&captures, R::GENE_INDEX),
        id: capture_as_string(&captures, R::ACCESSION_INDEX),
        mnemonic: capture_as_string(&captures, R::MNEMONIC_INDEX),
        name: capture_as_string(&captures, R::NAME_INDEX),
        organism: capture_as_string(&captures, R::ORGANISM_INDEX),
        taxonomy: optional_capture_as_string(&captures, R::TAXONOMY_INDEX),
        reviewed: false,
        proteome: String::new(),
        sequence: vec![],
    })
}
/// Read a single FASTA entry from `reader` and build a `Record` from it.
///
/// The first line must be a Swiss-Prot (`>sp`) or TrEMBL (`>tr`) header; all
/// remaining lines are concatenated into the sequence. When the sequence is
/// non-empty, `length` and the (rounded) average `mass` are derived from it.
///
/// # Errors
///
/// * `InvalidInput` if the reader is empty or the header is shorter than
///   three bytes.
/// * `InvalidFastaType` if the header starts with neither `>sp` nor `>tr`.
/// * Any I/O error from `reader`, or any error from header parsing.
pub fn record_from_fasta<T: BufRead>(reader: &mut T)
    -> ResultType<Record>
{
    let mut lines = reader.lines();
    let header = none_to_error!(lines.next(), InvalidInput)?;
    bool_to_error!(header.len() >= 3, InvalidInput);

    // Compare prefixes with `starts_with` rather than slicing `header[..3]`:
    // byte-index slicing panics when offset 3 falls inside a multi-byte
    // UTF-8 character, which arbitrary input can trigger.
    let mut record = if header.starts_with(">sp") {
        record_header_from_swissprot(&header)?
    } else if header.starts_with(">tr") {
        record_header_from_trembl(&header)?
    } else {
        return Err(From::from(ErrorKind::InvalidFastaType));
    };

    for line in lines {
        record.sequence.extend_from_slice(line?.as_bytes());
    }

    if !record.sequence.is_empty() {
        record.length = record.sequence.len() as u32;
        let mass = AverageMass::protein_sequence_mass(record.sequence.as_slice());
        record.mass = mass.round() as u64;
    }

    Ok(record)
}
/// Iterator that parses each FASTA entry produced by a `FastaIter` into a
/// `Record`.
pub struct FastaRecordIter<T: BufRead> {
/// Underlying iterator over the raw text of each entry.
iter: FastaIter<T>
}
impl<T: BufRead> FastaRecordIter<T> {
    /// Create a record iterator that reads FASTA entries from `reader`.
    #[inline]
    pub fn new(reader: T) -> Self {
        let iter = FastaIter::new(reader);
        FastaRecordIter { iter }
    }
}
impl<T: BufRead> Iterator for FastaRecordIter<T> {
    type Item = ResultType<Record>;

    /// Fetch the next entry's text and parse it into a `Record`.
    ///
    /// Returns `None` once the underlying text iterator is exhausted. A read
    /// error is passed through unchanged; otherwise the result of
    /// `Record::from_fasta_string` is yielded.
    fn next(&mut self) -> Option<Self::Item> {
        self.iter
            .next()
            .map(|entry| entry.and_then(|text| Record::from_fasta_string(&text)))
    }
}
/// Create an iterator that parses `Record`s from the FASTA text in `reader`.
///
/// Convenience wrapper around `FastaRecordIter::new`.
#[inline(always)]
pub fn iterator_from_fasta<T: BufRead>(reader: T) -> FastaRecordIter<T> {
FastaRecordIter::new(reader)
}
/// `FastaRecordIter` wrapped in the project's `StrictIter` adaptor.
pub type FastaRecordStrictIter<T> = StrictIter<Record, FastaRecordIter<T>>;
/// Create a strict record iterator over the FASTA text in `reader`.
///
/// In this file's tests, collecting the strict iterator over input that
/// contains an empty entry yields an error.
#[inline(always)]
pub fn iterator_from_fasta_strict<T: BufRead>(reader: T) -> FastaRecordStrictIter<T> {
FastaRecordStrictIter::new(iterator_from_fasta(reader))
}
/// `FastaRecordIter` wrapped in the project's `LenientIter` adaptor.
pub type FastaRecordLenientIter<T> = LenientIter<Record, FastaRecordIter<T>>;
/// Create a lenient record iterator over the FASTA text in `reader`.
///
/// In this file's tests, collecting the lenient iterator over input that
/// contains an empty entry succeeds and omits that entry.
#[inline(always)]
pub fn iterator_from_fasta_lenient<T: BufRead>(reader: T) -> FastaRecordLenientIter<T> {
FastaRecordLenientIter::new(iterator_from_fasta(reader))
}
/// FASTA serialization for a single `Record`; every method forwards to the
/// corresponding free function in this module.
impl Fasta for Record {
/// Estimated serialized size in bytes (see `estimate_record_size`).
#[inline]
fn estimate_fasta_size(&self) -> usize {
estimate_record_size(self)
}
/// Write this record to `writer` (see `record_to_fasta`).
#[inline(always)]
fn to_fasta<T: Write>(&self, writer: &mut T) -> ResultType<()> {
record_to_fasta(writer, self)
}
/// Read one record from `reader` (see `record_from_fasta`).
fn from_fasta<T: BufRead>(reader: &mut T) -> ResultType<Self> {
record_from_fasta(reader)
}
}
impl Fasta for RecordList {
    /// Estimated serialized size in bytes of the whole list.
    #[inline]
    fn estimate_fasta_size(&self) -> usize {
        estimate_list_size(self)
    }

    /// Write every record in the list to `writer` as FASTA.
    #[inline(always)]
    fn to_fasta<T: Write>(&self, writer: &mut T) -> ResultType<()> {
        let records = self.iter();
        reference_iterator_to_fasta(writer, records)
    }

    /// Parse all FASTA entries in `reader`, stopping at the first error.
    #[inline(always)]
    fn from_fasta<T: BufRead>(reader: &mut T) -> ResultType<RecordList> {
        let records = iterator_from_fasta(reader);
        records.collect()
    }
}
impl FastaCollection for RecordList {
    /// Write the list as FASTA via the strict exporter.
    #[inline(always)]
    fn to_fasta_strict<T: Write>(&self, writer: &mut T) -> ResultType<()> {
        let records = self.iter();
        reference_iterator_to_fasta_strict(writer, records)
    }

    /// Write the list as FASTA via the lenient exporter.
    #[inline(always)]
    fn to_fasta_lenient<T: Write>(&self, writer: &mut T) -> ResultType<()> {
        let records = self.iter();
        reference_iterator_to_fasta_lenient(writer, records)
    }

    /// Parse `reader` with the strict iterator, stopping at the first error.
    #[inline(always)]
    fn from_fasta_strict<T: BufRead>(reader: &mut T) -> ResultType<RecordList> {
        let records = iterator_from_fasta_strict(reader);
        records.collect()
    }

    /// Parse `reader` with the lenient iterator.
    ///
    /// Any item the lenient iterator still reports as an error is dropped,
    /// so this always returns `Ok`.
    #[inline(always)]
    fn from_fasta_lenient<T: BufRead>(reader: &mut T) -> ResultType<RecordList> {
        let parsed: RecordList = iterator_from_fasta_lenient(reader)
            .filter_map(|item| item.ok())
            .collect();
        Ok(parsed)
    }
}
// Unit tests for the FASTA reader/writer. Record fixtures (`gapdh`, `bsa`)
// and the expected FASTA text constants come from the sibling `test` module.
#[cfg(test)]
mod tests {
use bencher;
use std::fs::File;
use std::io::{BufReader, Cursor};
use std::path::PathBuf;
use test::testdata_dir;
use super::*;
use super::super::test::*;
// `FastaIter` splits the input into one string per `>`-prefixed entry and
// yields nothing for empty input.
#[test]
fn fasta_iter_test() {
let s = ">tr\nXX\n>sp\nXX\nXX\n>tr\n";
let i = FastaIter::new(Cursor::new(s));
let r: ResultType<Vec<String>> = i.collect();
assert_eq!(r.unwrap(), &[">tr\nXX\n", ">sp\nXX\nXX\n", ">tr\n"]);
let s = "";
let i = FastaIter::new(Cursor::new(s));
let r: ResultType<Vec<String>> = i.collect();
assert_eq!(r.unwrap(), Vec::<String>::new());
}
// Size estimates for the fixtures; the list estimate is the sum of the
// per-record estimates (458 + 693 = 1151).
#[test]
fn estimate_size_test() {
let g = gapdh();
let b = bsa();
let v = vec![gapdh(), bsa()];
assert_eq!(estimate_record_size(&g), 458);
assert_eq!(estimate_record_size(&b), 693);
assert_eq!(estimate_list_size(&v), 1151);
}
// Exercises every exporter (reference/value x default/strict/lenient).
// `v` holds only valid records; `u` additionally holds an empty record.
#[test]
fn iterator_to_fasta_test() {
let v = vec![gapdh(), bsa()];
let u = vec![gapdh(), bsa(), Record::new()];
// Default exporters.
let mut w = Cursor::new(vec![]);
reference_iterator_to_fasta(&mut w, v.iter()).unwrap();
assert_eq!(String::from_utf8(w.into_inner()).unwrap(), GAPDH_BSA_FASTA);
let mut w = Cursor::new(vec![]);
value_iterator_to_fasta(&mut w, iterator_by_value!(v.iter())).unwrap();
assert_eq!(String::from_utf8(w.into_inner()).unwrap(), GAPDH_BSA_FASTA);
// Strict exporters: succeed on `v`, fail on `u`.
let mut w = Cursor::new(vec![]);
reference_iterator_to_fasta_strict(&mut w, v.iter()).unwrap();
assert_eq!(String::from_utf8(w.into_inner()).unwrap(), GAPDH_BSA_FASTA);
let mut w = Cursor::new(vec![]);
let r = reference_iterator_to_fasta_strict(&mut w, u.iter());
assert!(r.is_err());
let mut w = Cursor::new(vec![]);
value_iterator_to_fasta_strict(&mut w, iterator_by_value!(v.iter())).unwrap();
assert_eq!(String::from_utf8(w.into_inner()).unwrap(), GAPDH_BSA_FASTA);
let mut w = Cursor::new(vec![]);
let r = value_iterator_to_fasta_strict(&mut w, iterator_by_value!(u.iter()));
assert!(r.is_err());
// Lenient exporters: `u` produces the same output as `v`.
let mut w = Cursor::new(vec![]);
reference_iterator_to_fasta_lenient(&mut w, v.iter()).unwrap();
assert_eq!(String::from_utf8(w.into_inner()).unwrap(), GAPDH_BSA_FASTA);
let mut w = Cursor::new(vec![]);
reference_iterator_to_fasta_lenient(&mut w, u.iter()).unwrap();
assert_eq!(String::from_utf8(w.into_inner()).unwrap(), GAPDH_BSA_FASTA);
let mut w = Cursor::new(vec![]);
value_iterator_to_fasta_lenient(&mut w, iterator_by_value!(v.iter())).unwrap();
assert_eq!(String::from_utf8(w.into_inner()).unwrap(), GAPDH_BSA_FASTA);
let mut w = Cursor::new(vec![]);
value_iterator_to_fasta_lenient(&mut w, iterator_by_value!(u.iter())).unwrap();
assert_eq!(String::from_utf8(w.into_inner()).unwrap(), GAPDH_BSA_FASTA);
}
// Exercises every importer (default/strict/lenient) on valid input and on
// input containing an empty entry.
#[test]
fn iterator_from_fasta_test() {
let text = GAPDH_BSA_FASTA;
let expected = vec![gapdh(), bsa()];
let iter = FastaRecordIter::new(Cursor::new(text));
let v: ResultType<RecordList> = iter.collect();
incomplete_list_eq(&expected, &v.unwrap());
// The bare constructor calls below only check that a `&mut` reader is
// accepted; their results are discarded.
iterator_from_fasta(&mut Cursor::new(text));
let iter = iterator_from_fasta_strict(Cursor::new(text));
let v: ResultType<RecordList> = iter.collect();
incomplete_list_eq(&expected, &v.unwrap());
iterator_from_fasta_strict(&mut Cursor::new(text));
let iter = iterator_from_fasta_lenient(Cursor::new(text));
let v: ResultType<RecordList> = iter.collect();
incomplete_list_eq(&expected, &v.unwrap());
iterator_from_fasta_lenient(&mut Cursor::new(text));
// Input with an empty entry: default keeps it, strict errors, lenient
// skips it.
let text = GAPDH_EMPTY_FASTA;
let expected1 = vec![gapdh(), Record::new()];
let expected2 = vec![gapdh()];
let iter = iterator_from_fasta(Cursor::new(text));
let v: ResultType<RecordList> = iter.collect();
let v = v.unwrap();
assert_eq!(expected1.len(), v.len());
incomplete_eq(&expected1[0], &v[0]);
assert_eq!(expected1[1], v[1]);
let iter = iterator_from_fasta_strict(Cursor::new(text));
let v: ResultType<RecordList> = iter.collect();
assert!(v.is_err());
let iter = iterator_from_fasta_lenient(Cursor::new(text));
let v: ResultType<RecordList> = iter.collect();
incomplete_list_eq(&expected2, &v.unwrap());
}
// Directory holding the UniProt FASTA test data files.
fn fasta_dir() -> PathBuf {
let mut dir = testdata_dir();
dir.push("uniprot/fasta");
dir
}
// Parses every entry of `human.fasta` and unwraps each result. Marked
// `#[ignore]`, so it only runs when ignored tests are requested.
#[test]
#[ignore]
fn human_fasta_test() {
let mut path = fasta_dir();
path.push("human.fasta");
let reader = BufReader::new(File::open(path).unwrap());
let iter = FastaRecordIter::new(reader);
for item in iter {
bencher::black_box(item).unwrap();
}
}
}