use std::{
fs::File,
io::{Read, Write},
mem,
path::Path,
};
use traitsequence::interface::Sequence;
use crate::{
interface::{alphabet::Alphabet, sequence_store::SequenceStore},
io::{peekable_reader::PeekableReader, unzip_if_zipped},
};
use super::{error::IOError, zip, ZipFormat};
pub struct FastaRecord<Handle> {
pub id: String,
pub comment: String,
pub sequence_handle: Handle,
}
pub fn read_fasta_file<AlphabetType: Alphabet, SequenceStoreType: SequenceStore<AlphabetType>>(
path: impl AsRef<Path>,
store: &mut SequenceStoreType,
) -> Result<Vec<FastaRecord<SequenceStoreType::Handle>>, IOError> {
let zip_format_hint = ZipFormat::from_path_name(&path);
let file = File::open(path)?;
unzip_if_zipped(file, zip_format_hint, |reader| read_fasta(reader, store))
}
pub fn read_fasta<AlphabetType: Alphabet, SequenceStoreType: SequenceStore<AlphabetType>>(
reader: impl Read,
store: &mut SequenceStoreType,
) -> Result<Vec<FastaRecord<SequenceStoreType::Handle>>, IOError> {
enum State {
Init,
RecordId,
RecordWhitespace,
RecordComment,
RecordSequence,
EndOfRecord { has_more_records: bool },
}
let mut records = Vec::new();
let mut record_id = String::new();
let mut record_comment = String::new();
let mut record_sequence_handle: Option<SequenceStoreType::Handle> = None;
let mut reader = PeekableReader::new(reader);
let mut state = State::Init;
let mut buffer = Vec::<u8>::new();
loop {
match state {
State::Init => {
buffer.resize(1, 0);
let mut is_newline = true;
state = loop {
let result = reader.read_exact(&mut buffer);
if let Err(error) = result {
if matches!(error.kind(), std::io::ErrorKind::UnexpectedEof) {
return Ok(records);
} else {
return Err(error.into());
}
}
match buffer[0] {
b'\r' | b'\n' => is_newline = true,
b'>' => {
if is_newline {
break State::RecordId;
}
}
_ => is_newline = false,
}
};
}
State::RecordId => {
buffer.resize(1, 0);
state = loop {
let result = reader.read_exact(&mut buffer);
if let Err(error) = result {
if matches!(error.kind(), std::io::ErrorKind::UnexpectedEof) {
break State::EndOfRecord {
has_more_records: false,
};
} else {
return Err(error.into());
}
}
if buffer[0] == b'\n' || buffer[0] == b'\r' {
break State::RecordSequence;
} else if buffer[0].is_ascii_whitespace() {
break State::RecordWhitespace;
} else {
record_id.push(buffer[0].into());
}
};
}
State::RecordWhitespace => {
buffer.resize(1, 0);
state = loop {
let result = reader.read_exact(&mut buffer);
if let Err(error) = result {
if matches!(error.kind(), std::io::ErrorKind::UnexpectedEof) {
break State::EndOfRecord {
has_more_records: false,
};
} else {
return Err(error.into());
}
}
if buffer[0] == b'\n' || buffer[0] == b'\r' {
break State::RecordSequence;
} else if !buffer[0].is_ascii_whitespace() {
record_comment.push(buffer[0].into());
break State::RecordComment;
}
};
}
State::RecordComment => {
buffer.resize(1, 0);
state = loop {
let result = reader.read_exact(&mut buffer);
if let Err(error) = result {
if matches!(error.kind(), std::io::ErrorKind::UnexpectedEof) {
break State::EndOfRecord {
has_more_records: false,
};
} else {
return Err(error.into());
}
}
if buffer[0] == b'\n' || buffer[0] == b'\r' {
break State::RecordSequence;
} else {
record_comment.push(buffer[0].into());
}
};
}
State::RecordSequence => {
let mut iterator = FastaSequenceIterator {
reader: &mut reader,
buffer: Default::default(),
result: None,
newline: true,
};
record_sequence_handle = Some(store.add_from_iter_u8(&mut iterator)?);
state = State::EndOfRecord {
has_more_records: iterator.result.unwrap()?,
};
}
State::EndOfRecord { has_more_records } => {
let comment = record_comment.trim_end().to_string();
record_comment.clear();
let sequence_handle = record_sequence_handle
.take()
.unwrap_or_else(|| store.add_from_slice_u8(&[]).unwrap());
records.push(FastaRecord {
id: mem::take(&mut record_id),
comment,
sequence_handle,
});
if has_more_records {
state = State::RecordId;
} else {
break;
}
}
}
}
Ok(records)
}
struct FastaSequenceIterator<'a, Reader> {
reader: &'a mut Reader,
buffer: [u8; 1],
result: Option<Result<bool, IOError>>,
newline: bool,
}
impl<'a, Reader: Read> Iterator for FastaSequenceIterator<'a, Reader> {
type Item = u8;
fn next(&mut self) -> Option<Self::Item> {
if self.result.is_some() {
return None;
}
loop {
let result = self.reader.read_exact(&mut self.buffer);
if let Err(error) = result {
if matches!(error.kind(), std::io::ErrorKind::UnexpectedEof) {
self.result = Some(Ok(false));
return None;
} else {
self.result = Some(Err(error.into()));
return None;
}
}
if self.buffer[0] == b'>' && self.newline {
self.result = Some(Ok(true));
self.newline = false;
return None;
} else if self.buffer[0] != b'\n' && self.buffer[0] != b'\r' {
self.newline = false;
return Some(self.buffer[0]);
} else {
self.newline = true;
}
}
}
}
pub fn write_fasta_file<AlphabetType: Alphabet, SequenceStoreType: SequenceStore<AlphabetType>>(
path: impl AsRef<Path>,
records: impl IntoIterator<Item = FastaRecord<SequenceStoreType::Handle>>,
store: &SequenceStoreType,
) -> Result<(), IOError> {
let zip_format = ZipFormat::from_path_name(&path);
let file = File::create(path)?;
zip(file, zip_format, |writer| {
write_fasta(writer, records, store)
})
}
pub fn write_fasta<AlphabetType: Alphabet, SequenceStoreType: SequenceStore<AlphabetType>>(
mut writer: impl Write,
records: impl IntoIterator<Item = FastaRecord<SequenceStoreType::Handle>>,
store: &SequenceStoreType,
) -> Result<(), IOError> {
for record in records {
writeln!(
writer,
">{id}{space}{comment}",
id = record.id,
space = if record.comment.is_empty() { "" } else { " " },
comment = record.comment
)?;
let sequence = store.get(&record.sequence_handle);
for character in sequence.iter() {
write!(writer, "{character}")?;
}
writeln!(writer)?;
}
Ok(())
}
#[cfg(test)]
mod tests {
use core::str;
use crate::{
implementation::alphabets::dna_alphabet::DnaAlphabet, implementation::DefaultSequenceStore,
};
use super::{read_fasta, write_fasta};
#[test]
fn test_read_write() {
let input_file =
b">alt1 comment1\nGGTTGGCCT\n>f2\nACCTG\n>f3 \nAA\n>seq c2 \nGT".as_slice();
let expected_output_file =
b">alt1 comment1\nGGTTGGCCT\n>f2\nACCTG\n>f3\nAA\n>seq c2\nGT\n".as_slice();
let mut store = DefaultSequenceStore::<DnaAlphabet>::new();
let records = read_fasta(input_file, &mut store).unwrap();
let mut output_file = Vec::new();
write_fasta(&mut output_file, records, &store).unwrap();
assert_eq!(
expected_output_file,
output_file,
"expected output:\n{}\n\noutput:\n{}",
str::from_utf8(expected_output_file)
.unwrap()
.replace(' ', "_"),
str::from_utf8(&output_file).unwrap().replace(' ', "_"),
);
}
}