1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
//! Allows operations over the [FASTA format](https://en.wikipedia.org/wiki/FASTA_format).


use std::io;
use std::io::BufRead;
use std::io::BufReader;
use std::io::Lines;
use std::io::Read;
use std::io::Write;
use std::iter::Peekable;

use errors;
use errors::Result;

// Capacity, in bytes, of the buffered reader that `Reader::new` wraps around
// its input.
const BUFFER_SIZE: usize = 10_000_000; // 10MB
// Maximum number of bytes written per sequence line when the writer wraps
// its output.
const FASTA_WIDTH: usize = 70;

/// Reads a FASTA-formatted source (e.g. a file), one record at a time.
pub struct Reader<R: Read> {
	// Buffered line iterator over the source. It is peekable so that the
	// start of the next record can be detected without consuming its header.
	lines: Peekable<Lines<BufReader<R>>>,
	// When `true`, the sequence lines of a record are concatenated into a
	// single item instead of being kept as one item per line.
	unwrap: bool,
}


impl<R: Read> Reader<R> {
	/// Creates a Reader from the given Read (e.g. a file).
	/// When unwrap is `true`, the reader will unwrap the input sequences by
	/// removing newlines between the sequence lines
	/// (each [Record.sequence](struct.Record.html)) will only have one item.
	/// When unwrap is `false`, each record line will be a new item in
	/// the [Record.sequence](struct.Record.html) vec.
	pub fn new(reader: R, unwrap: bool) -> Self {
		let lines = BufReader::with_capacity(BUFFER_SIZE, reader).lines()
		                                                         .peekable();
		Reader { unwrap, lines }
	}

	/// Reads the next record from the FASTA file.
	pub fn read_record(&mut self) -> Result<Option<Record>> {
		let mut header = match self.lines.next() {
			None => return Ok(None),
			Some(header) => header?,
		};

		if !header.starts_with('>') {
			bail!(errors::ErrorKind::Io(io::Error::new(
				io::ErrorKind::Other,
				"Expected > at beginning of fasta header."
			)));
		}
		let _ = header.remove(0);

		let mut sequence = Vec::new();
		while self.lines
		          .peek()
		          .and_then(|line| line.as_ref().ok())
		          .map(|line| !line.starts_with('>'))
		          .unwrap_or(false)
		{
			sequence.push(self.lines.next().unwrap()?);
		}
		if self.unwrap {
			sequence = vec![sequence.concat()];
		}

		Ok(Some(Record { header, sequence }))
	}

	/// Returns a Records struct with itself as its reader.
	pub fn records(self) -> Records<R> {
		Records { reader: self }
	}
}

/// A record extending the FASTA format. A single header can be followed by multiple
/// sequences/items.
#[derive(Debug)]
pub struct Record {
	/// The record header (without the preceding '>')
	pub header: String,

	/// The actual sequence of nucleotides, stored as one or more items.
	/// The writer joins the items with its separator before printing them.
	pub sequence: Vec<String>,
}

/// Convenience struct which allows for iteration (e.g. using for..in)
/// over the records produced by a [Reader](struct.Reader.html).
pub struct Records<R: Read> {
	// The reader the records are pulled from.
	reader: Reader<R>,
}

impl<R: Read> Records<R> {
	/// Convert this to a chunked version which processes the records in chunks
	/// of the given size.
	pub fn chunked(self, size: usize) -> ChunkedRecords<R> {
		ChunkedRecords { reader: self.reader,
		                 chunk_size: size }
	}
}

impl<R: Read> Iterator for Records<R> {
	type Item = Result<Record>;

	/// Yields the next record, or the error hit while reading it.
	/// Iteration ends when the reader reports that no record is left.
	fn next(&mut self) -> Option<Result<Record>> {
		let outcome = self.reader.read_record();
		match outcome {
			Err(failure) => Some(Err(failure)),
			// `Some(record)` becomes `Some(Ok(record))`, `None` stays `None`.
			Ok(maybe_record) => maybe_record.map(Ok),
		}
	}
}

/// Allows for iteration over records in chunks
pub struct ChunkedRecords<R: Read> {
	// The reader the records are pulled from.
	reader: Reader<R>,
	// Upper bound on the number of records gathered into one chunk.
	chunk_size: usize,
}

impl<R: Read> Iterator for ChunkedRecords<R> {
	type Item = Result<Vec<Record>>;

	/// Gathers up to `chunk_size` records into one chunk.
	///
	/// A chunk may be shorter when the reader runs out of records; iteration
	/// ends when not a single record could be gathered. If reading a record
	/// fails, the error is yielded and the records gathered so far for this
	/// chunk are dropped.
	fn next(&mut self) -> Option<Result<Vec<Record>>> {
		let limit = self.chunk_size;
		let mut batch = Vec::with_capacity(limit);
		for _ in 0..limit {
			match self.reader.read_record() {
				Err(failure) => return Some(Err(failure)),
				Ok(None) => break,
				Ok(Some(record)) => batch.push(record),
			}
		}
		if !batch.is_empty() {
			Some(Ok(batch))
		} else {
			None
		}
	}
}

/// Writes to a file in the [FASTA format](https://en.wikipedia.org/wiki/FASTA_format).
pub struct Writer<'a, W: Write> {
	// Buffered wrapper around the output the records are written to.
	buffer: io::BufWriter<W>,
	// Text placed between the items of a record's sequence when they are joined.
	separator: &'a str,
	// When `true`, the joined sequence is split into lines of at most
	// `FASTA_WIDTH` bytes; when `false` it is written on a single line.
	wrap: bool,
}

impl<'a, W: Write> Writer<'a, W> {
	/// Constructs a writer from the specified Write.
	///
	/// The items of each record's sequence are printed separated by
	/// `separator`. When `wrap` is `true`, the joined sequence is broken into
	/// lines of at most `FASTA_WIDTH` bytes; when `false`, it is written on a
	/// single line.
	pub fn new(write: W, separator: &'a str, wrap: bool) -> Self {
		Writer { buffer: io::BufWriter::new(write),
		         separator,
		         wrap }
	}

	/// Convenience method, see [write_record_ref](#method.write_record_ref).
	pub fn write_record(&mut self, record: Record) -> Result<()> {
		self.write_record_ref(&record)
	}

	/// Writes a Record to the Write, in FASTA format.
	///
	/// The header line (`>` followed by the header text) is always terminated
	/// by a newline, even when the record has no sequence, so that consecutive
	/// records never end up on the same line. A non-empty sequence follows on
	/// its own line(s), each terminated by a newline.
	///
	/// # Errors
	///
	/// Fails when writing to the underlying output fails.
	pub fn write_record_ref(&mut self, record: &Record) -> Result<()> {
		writeln!(self.buffer, ">{}", record.header)?;

		let sequence = record.sequence.join(self.separator);
		if sequence.is_empty() {
			return Ok(());
		}

		if self.wrap {
			// Wrapping is done on bytes, not on characters.
			for line in sequence.as_bytes().chunks(FASTA_WIDTH) {
				self.buffer.write_all(line)?;
				self.buffer.write_all(b"\n")?;
			}
		} else {
			// `write_all` rather than `write`: a single `write` call is
			// allowed to write only part of the data.
			self.buffer.write_all(sequence.as_bytes())?;
			self.buffer.write_all(b"\n")?;
		}
		Ok(())
	}
}