prseq/
fastq.rs

1use crate::common::create_reader_with_compression;
2use std::fs::File;
3use std::io::{BufRead, Read, Result};
4use std::path::Path;
5
/// Represents a single FASTQ sequence record.
///
/// All fields are plain owned strings, so the type also derives `Eq`, `Hash`
/// and `Default`; records can be de-duplicated in a `HashSet` or used as map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct FastqRecord {
    /// Header text following the leading `@` (identifier plus any description).
    pub id: String,
    /// Sequence characters, with multi-line sequences concatenated.
    pub sequence: String,
    /// Quality characters, with multi-line quality blocks concatenated.
    pub quality: String,
}
13
/// Streaming source of FASTQ records backed by any readable input.
///
/// Records are produced lazily, one per parse step, from the underlying
/// line iterator.
pub struct FastqReader {
    /// Line-by-line view of the buffered, type-erased input stream.
    lines: std::io::Lines<std::io::BufReader<Box<dyn Read + Send>>>,
    /// Initial capacity, in bytes, reserved for each record's sequence buffer.
    sequence_size_hint: usize,
}
19
20impl FastqReader {
21    /// Create a new FastqReader from a file path
22    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
23        Self::from_file_with_capacity(path, 64 * 1024)
24    }
25
26    /// Create a new FastqReader from a file path with a sequence size hint
27    ///
28    /// The size_hint helps optimize memory allocation for sequence data.
29    /// Use smaller values (e.g., 100-1000) for short sequences like primers,
30    /// or larger values (e.g., 50000+) for genomes or long sequences.
31    pub fn from_file_with_capacity<P: AsRef<Path>>(
32        path: P,
33        sequence_size_hint: usize,
34    ) -> Result<Self> {
35        let file = File::open(path)?;
36        Self::from_reader_with_capacity(file, sequence_size_hint)
37    }
38
39    /// Create a new FastqReader from stdin
40    pub fn from_stdin() -> Result<Self> {
41        Self::from_stdin_with_capacity(64 * 1024)
42    }
43
44    /// Create a new FastqReader from stdin with a sequence size hint
45    pub fn from_stdin_with_capacity(sequence_size_hint: usize) -> Result<Self> {
46        let stdin = std::io::stdin();
47        Self::from_reader_with_capacity(stdin, sequence_size_hint)
48    }
49
50    /// Create a new FastqReader from any readable source with compression detection
51    pub fn from_reader_with_capacity<R: Read + Send + 'static>(
52        reader: R,
53        sequence_size_hint: usize,
54    ) -> Result<Self> {
55        let buf_reader = create_reader_with_compression(reader)?;
56        let lines = buf_reader.lines();
57
58        Ok(FastqReader {
59            lines,
60            sequence_size_hint: sequence_size_hint.max(64),
61        })
62    }
63
64    fn read_next(&mut self) -> Result<Option<FastqRecord>> {
65        // Read header line (@id)
66        let id = loop {
67            match self.lines.next() {
68                Some(Ok(line)) => {
69                    if line.is_empty() || line.chars().all(|c| c.is_whitespace()) {
70                        continue;
71                    }
72                    let trimmed = line.trim();
73                    if !trimmed.starts_with('@') {
74                        return Err(std::io::Error::new(
75                            std::io::ErrorKind::InvalidData,
76                            "FASTQ record must start with '@'",
77                        ));
78                    }
79                    break trimmed[1..].to_string();
80                }
81                Some(Err(e)) => return Err(e),
82                None => return Ok(None),
83            }
84        };
85
86        // Read sequence lines (until we hit a '+' line)
87        let mut sequence = String::with_capacity(self.sequence_size_hint);
88        let plus_line = loop {
89            match self.lines.next() {
90                Some(Ok(line)) => {
91                    let trimmed = line.trim();
92                    if trimmed.starts_with('+') {
93                        break trimmed.to_string();
94                    }
95                    if !line.is_empty() && !line.chars().all(|c| c.is_whitespace()) {
96                        sequence.push_str(trimmed);
97                    }
98                }
99                Some(Err(e)) => return Err(e),
100                None => {
101                    return Err(std::io::Error::new(
102                        std::io::ErrorKind::UnexpectedEof,
103                        "Unexpected end of file while reading FASTQ sequence",
104                    ));
105                }
106            }
107        };
108
109        // Validate the '+' line if it contains an ID
110        if plus_line.len() > 1 {
111            let plus_id = &plus_line[1..];
112            if plus_id != id {
113                return Err(std::io::Error::new(
114                    std::io::ErrorKind::InvalidData,
115                    format!(
116                        "FASTQ '+' line ID '{}' does not match header ID '{}'",
117                        plus_id, id
118                    ),
119                ));
120            }
121        }
122
123        // Read quality lines (must match sequence length)
124        let mut quality = String::with_capacity(sequence.len());
125        let sequence_len = sequence.len();
126
127        while quality.len() < sequence_len {
128            match self.lines.next() {
129                Some(Ok(line)) => {
130                    let trimmed = line.trim();
131                    if !line.is_empty() && !line.chars().all(|c| c.is_whitespace()) {
132                        // Only add as many characters as we need
133                        let needed = sequence_len - quality.len();
134                        let to_add = if trimmed.len() <= needed {
135                            trimmed
136                        } else {
137                            &trimmed[..needed]
138                        };
139                        quality.push_str(to_add);
140                    }
141                }
142                Some(Err(e)) => return Err(e),
143                None => {
144                    return Err(std::io::Error::new(
145                        std::io::ErrorKind::UnexpectedEof,
146                        "Unexpected end of file while reading FASTQ quality scores",
147                    ));
148                }
149            }
150        }
151
152        // Validate that sequence and quality have the same length
153        if sequence.len() != quality.len() {
154            return Err(std::io::Error::new(
155                std::io::ErrorKind::InvalidData,
156                format!(
157                    "FASTQ sequence length ({}) does not match quality length ({})",
158                    sequence.len(),
159                    quality.len()
160                ),
161            ));
162        }
163
164        Ok(Some(FastqRecord {
165            id,
166            sequence,
167            quality,
168        }))
169    }
170}
171
172impl Iterator for FastqReader {
173    type Item = Result<FastqRecord>;
174
175    fn next(&mut self) -> Option<Self::Item> {
176        match self.read_next() {
177            Ok(Some(record)) => Some(Ok(record)),
178            Ok(None) => None,
179            Err(e) => Some(Err(e)),
180        }
181    }
182}
183
184pub fn read_fastq<P: AsRef<Path>>(path: P) -> Result<Vec<FastqRecord>> {
185    read_fastq_with_capacity(path, 64 * 1024)
186}
187
188pub fn read_fastq_with_capacity<P: AsRef<Path>>(
189    path: P,
190    sequence_size_hint: usize,
191) -> Result<Vec<FastqRecord>> {
192    let reader = FastqReader::from_file_with_capacity(path, sequence_size_hint)?;
193    reader.collect()
194}