use crate::core::seq_record::SeqRecord;
use anyhow::{Context, Result};
use bzip2::read::BzDecoder;
use flate2::read::GzDecoder;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
pub struct SeqReader<R: BufRead> {
reader: R,
buffer: Vec<u8>,
last_char: Option<u8>,
}
impl SeqReader<Box<dyn BufRead>> {
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
let file = File::open(path).with_context(|| format!("无法打开文件: {}", path.display()))?;
let reader: Box<dyn BufRead> = if path.extension().and_then(|s| s.to_str()) == Some("gz") {
Box::new(BufReader::with_capacity(16384, GzDecoder::new(file)))
} else if path.extension().and_then(|s| s.to_str()) == Some("bz2") {
Box::new(BufReader::with_capacity(16384, BzDecoder::new(file)))
} else {
Box::new(BufReader::with_capacity(16384, file))
};
Ok(Self::new(reader))
}
pub fn from_stdin() -> Self {
let stdin = std::io::stdin();
let reader = BufReader::with_capacity(16384, stdin);
Self::new(Box::new(reader))
}
}
impl<R: BufRead> SeqReader<R> {
pub fn new(reader: R) -> Self {
Self {
reader,
buffer: Vec::with_capacity(1024),
last_char: None,
}
}
pub fn read_next(&mut self, record: &mut SeqRecord) -> Result<bool> {
record.clear();
if self.last_char.is_none() {
loop {
let mut ch = [0u8; 1];
match self.reader.read(&mut ch) {
Ok(0) => return Ok(false), Ok(_) => {
if ch[0] == b'>' || ch[0] == b'@' {
self.last_char = Some(ch[0]);
break;
}
}
Err(e) => return Err(e.into()),
}
}
}
let is_fastq = self.last_char == Some(b'@');
self.buffer.clear();
let n = self.reader.read_until(b'\n', &mut self.buffer)?;
if n == 0 {
return Ok(false);
}
if let Some(space_pos) = self.buffer.iter().position(|&b| b == b' ' || b == b'\t') {
record.name = self.buffer[..space_pos].to_vec();
let comment_start = space_pos + 1;
let comment_end = if self.buffer.ends_with(b"\n") {
self.buffer.len() - 1
} else {
self.buffer.len()
};
if comment_end > comment_start {
record.comment = Some(self.buffer[comment_start..comment_end].to_vec());
}
} else {
let end = if self.buffer.ends_with(b"\n") {
self.buffer.len() - 1
} else {
self.buffer.len()
};
record.name = self.buffer[..end].to_vec();
}
loop {
let buf = self.reader.fill_buf()?;
if buf.is_empty() {
break; }
let ch = buf[0];
if ch == b'>' || ch == b'@' || ch == b'+' {
self.last_char = Some(ch);
self.reader.consume(1);
break;
}
self.buffer.clear();
self.reader.read_until(b'\n', &mut self.buffer)?;
let end = if self.buffer.ends_with(b"\n") {
self.buffer.len() - 1
} else {
self.buffer.len()
};
if end > 0 {
record.seq.extend_from_slice(&self.buffer[..end]);
}
}
if is_fastq {
self.buffer.clear();
self.reader.read_until(b'\n', &mut self.buffer)?;
let mut qual = Vec::with_capacity(record.seq.len());
while qual.len() < record.seq.len() {
self.buffer.clear();
let n = self.reader.read_until(b'\n', &mut self.buffer)?;
if n == 0 {
anyhow::bail!("质量值字符串被截断");
}
let end = if self.buffer.ends_with(b"\n") {
self.buffer.len() - 1
} else {
self.buffer.len()
};
qual.extend_from_slice(&self.buffer[..end]);
}
if qual.len() != record.seq.len() {
anyhow::bail!("质量值长度与序列长度不匹配");
}
record.qual = Some(qual);
self.last_char = None;
}
Ok(true)
}
}
impl<R: BufRead> Iterator for SeqReader<R> {
type Item = Result<SeqRecord>;
fn next(&mut self) -> Option<Self::Item> {
let mut record = SeqRecord::new(Vec::new(), Vec::new());
match self.read_next(&mut record) {
Ok(true) => Some(Ok(record)),
Ok(false) => None,
Err(e) => Some(Err(e)),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
#[test]
fn test_read_fasta() {
let fasta_data = b">seq1 comment\nACGT\nACGT\n>seq2\nTGCA\n";
let cursor = Cursor::new(fasta_data);
let reader = BufReader::new(cursor);
let mut seq_reader = SeqReader::new(reader);
let mut record = SeqRecord::new(Vec::new(), Vec::new());
assert!(seq_reader.read_next(&mut record).unwrap());
assert_eq!(record.name, b"seq1");
assert_eq!(record.comment, Some(b"comment".to_vec()));
assert_eq!(record.seq, b"ACGTACGT");
assert!(!record.is_fastq());
assert!(seq_reader.read_next(&mut record).unwrap());
assert_eq!(record.name, b"seq2");
assert_eq!(record.seq, b"TGCA");
assert!(!seq_reader.read_next(&mut record).unwrap());
}
#[test]
fn test_read_fastq() {
let fastq_data = b"@seq1\nACGT\n+\nIIII\n@seq2\nTGCA\n+\nJJJJ\n";
let cursor = Cursor::new(fastq_data);
let reader = BufReader::new(cursor);
let mut seq_reader = SeqReader::new(reader);
let mut record = SeqRecord::new(Vec::new(), Vec::new());
assert!(seq_reader.read_next(&mut record).unwrap());
assert_eq!(record.name, b"seq1");
assert_eq!(record.seq, b"ACGT");
assert_eq!(record.qual, Some(b"IIII".to_vec()));
assert!(record.is_fastq());
assert!(seq_reader.read_next(&mut record).unwrap());
assert_eq!(record.name, b"seq2");
assert_eq!(record.seq, b"TGCA");
assert_eq!(record.qual, Some(b"JJJJ".to_vec()));
}
#[test]
fn test_iterator() {
let fasta_data = b">seq1\nACGT\n>seq2\nTGCA\n";
let cursor = Cursor::new(fasta_data);
let reader = BufReader::new(cursor);
let seq_reader = SeqReader::new(reader);
let records: Vec<_> = seq_reader.map(|r| r.unwrap()).collect();
assert_eq!(records.len(), 2);
assert_eq!(records[0].name, b"seq1");
assert_eq!(records[1].name, b"seq2");
}
}