Skip to main content

argenus/
seqio.rs

1
2use anyhow::{Context, Result};
3use flate2::read::MultiGzDecoder;
4use std::fs::File;
5use std::io::{BufRead, BufReader, Read};
6use std::path::Path;
7
/// A single FASTA entry: a record name and its sequence.
///
/// `PartialEq`/`Eq` are derived so records can be compared directly, for
/// example with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FastaRecord {
    /// Record name. `FastaReader` fills this with the first
    /// whitespace-delimited token that follows `>` on the header line.
    pub name: String,

    /// Sequence text. `FastaReader` concatenates every sequence line of the
    /// record, dropping trailing whitespace (including line endings).
    pub seq: String,
}
15
16pub struct FastaReader {
17    reader: BufReader<File>,
18    line_buf: String,
19    current_name: Option<String>,
20}
21
22impl FastaReader {
23
24    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
25        let file = File::open(path.as_ref())
26            .with_context(|| format!("Failed to open FASTA: {}", path.as_ref().display()))?;
27        let mut reader = Self {
28            reader: BufReader::with_capacity(1024 * 1024, file),
29            line_buf: String::with_capacity(256),
30            current_name: None,
31        };
32
33        reader.line_buf.clear();
34        if reader.reader.read_line(&mut reader.line_buf)? > 0
35            && reader.line_buf.starts_with('>') {
36                reader.current_name = Some(
37                    reader.line_buf[1..]
38                        .split_whitespace()
39                        .next()
40                        .unwrap_or("")
41                        .to_string(),
42                );
43            }
44
45        Ok(reader)
46    }
47
48    pub fn read_next(&mut self) -> Result<Option<FastaRecord>> {
49        let name = match self.current_name.take() {
50            Some(n) => n,
51            None => return Ok(None),
52        };
53
54        let mut seq = String::with_capacity(10000);
55
56        loop {
57            self.line_buf.clear();
58            if self.reader.read_line(&mut self.line_buf)? == 0 {
59
60                break;
61            }
62
63            if self.line_buf.starts_with('>') {
64
65                self.current_name = Some(
66                    self.line_buf[1..]
67                        .split_whitespace()
68                        .next()
69                        .unwrap_or("")
70                        .to_string(),
71                );
72                break;
73            } else {
74
75                seq.push_str(self.line_buf.trim_end());
76            }
77        }
78
79        Ok(Some(FastaRecord { name, seq }))
80    }
81}
82
83impl Iterator for FastaReader {
84    type Item = Result<FastaRecord>;
85
86    fn next(&mut self) -> Option<Self::Item> {
87        match self.read_next() {
88            Ok(Some(record)) => Some(Ok(record)),
89            Ok(None) => None,
90            Err(e) => Some(Err(e)),
91        }
92    }
93}
94
/// A single FASTQ entry: read name, bases, and per-base quality string.
///
/// `PartialEq`/`Eq` are derived so records can be compared directly, for
/// example with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FastqRecord {
    /// Read name, taken from the header line without its leading `@`.
    pub name: String,

    /// Sequence line with trailing whitespace removed.
    pub seq: String,

    /// Quality line with trailing whitespace removed.
    pub qual: String,
}
104
/// Line-oriented FASTQ reader over any byte source `R`.
pub struct FastqReader<R>
where
    R: Read,
{
    /// Buffered wrapper around the underlying byte source.
    reader: BufReader<R>,
    /// Scratch buffer that is cleared and refilled for each line read.
    line_buf: String,
}
109
110impl FastqReader<File> {
111
112    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
113        let file = File::open(path.as_ref())
114            .with_context(|| format!("Failed to open FASTQ: {}", path.as_ref().display()))?;
115        Ok(Self {
116            reader: BufReader::with_capacity(1024 * 1024, file),
117            line_buf: String::with_capacity(512),
118        })
119    }
120}
121
122impl FastqReader<MultiGzDecoder<File>> {
123
124    pub fn open_gz<P: AsRef<Path>>(path: P) -> Result<Self> {
125        let file = File::open(path.as_ref())
126            .with_context(|| format!("Failed to open FASTQ.gz: {}", path.as_ref().display()))?;
127        let decoder = MultiGzDecoder::new(file);
128        Ok(Self {
129            reader: BufReader::with_capacity(1024 * 1024, decoder),
130            line_buf: String::with_capacity(512),
131        })
132    }
133}
134
135impl<R: Read> FastqReader<R> {
136
137    pub fn read_next(&mut self) -> Result<Option<FastqRecord>> {
138
139        self.line_buf.clear();
140        if self.reader.read_line(&mut self.line_buf)? == 0 {
141            return Ok(None);
142        }
143        let name = self.line_buf.trim_start_matches('@').trim_end().to_string();
144        if name.is_empty() {
145            return Ok(None);
146        }
147
148        self.line_buf.clear();
149        self.reader.read_line(&mut self.line_buf)?;
150        let seq = self.line_buf.trim_end().to_string();
151
152        self.line_buf.clear();
153        self.reader.read_line(&mut self.line_buf)?;
154
155        self.line_buf.clear();
156        self.reader.read_line(&mut self.line_buf)?;
157        let qual = self.line_buf.trim_end().to_string();
158
159        Ok(Some(FastqRecord { name, seq, qual }))
160    }
161}
162
163pub enum FastqFile {
164
165    Plain(FastqReader<File>),
166
167    Gzipped(FastqReader<MultiGzDecoder<File>>),
168}
169
170impl FastqFile {
171
172    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
173        let path = path.as_ref();
174        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
175
176        if ext == "gz" {
177            Ok(FastqFile::Gzipped(FastqReader::open_gz(path)?))
178        } else {
179            Ok(FastqFile::Plain(FastqReader::open(path)?))
180        }
181    }
182
183    pub fn read_next(&mut self) -> Result<Option<FastqRecord>> {
184        match self {
185            FastqFile::Plain(r) => r.read_next(),
186            FastqFile::Gzipped(r) => r.read_next(),
187        }
188    }
189}
190
#[cfg(test)]
mod tests {
    use super::*;

    /// A hand-built record carries sequence and quality strings of equal
    /// length.
    #[test]
    fn test_fastq_record() {
        let (name, seq, qual) = ("read1", "ATGC", "IIII");
        let record = FastqRecord {
            name: name.to_owned(),
            seq: seq.to_owned(),
            qual: qual.to_owned(),
        };

        let seq_len = record.seq.len();
        let qual_len = record.qual.len();
        assert_eq!(seq_len, qual_len);
    }
}