1use std::path::Path;
7
8use cyanea_core::{Annotated, CyaneaError, Result, Sequence, Summarizable};
9
10use crate::quality::{PhredEncoding, QualityScores};
11use crate::types::DnaSequence;
12
13#[derive(Debug, Clone)]
15#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
16pub struct FastqRecord {
17 name: String,
18 description: Option<String>,
19 sequence: DnaSequence,
20 quality: QualityScores,
21}
22
23impl FastqRecord {
24 pub fn new(
28 name: String,
29 description: Option<String>,
30 sequence: DnaSequence,
31 quality: QualityScores,
32 ) -> Result<Self> {
33 if sequence.len() != quality.len() {
34 return Err(CyaneaError::InvalidInput(format!(
35 "sequence length ({}) does not match quality length ({})",
36 sequence.len(),
37 quality.len()
38 )));
39 }
40 Ok(Self {
41 name,
42 description,
43 sequence,
44 quality,
45 })
46 }
47
48 pub fn sequence(&self) -> &DnaSequence {
50 &self.sequence
51 }
52
53 pub fn quality(&self) -> &QualityScores {
55 &self.quality
56 }
57}
58
59impl Sequence for FastqRecord {
60 fn as_bytes(&self) -> &[u8] {
61 self.sequence.as_bytes()
62 }
63}
64
65impl Annotated for FastqRecord {
66 fn name(&self) -> &str {
67 &self.name
68 }
69
70 fn description(&self) -> Option<&str> {
71 self.description.as_deref()
72 }
73}
74
75impl Summarizable for FastqRecord {
76 fn summary(&self) -> String {
77 format!(
78 "FASTQ {} ({} bp, mean Q{:.1})",
79 self.name,
80 self.sequence.len(),
81 self.quality.mean()
82 )
83 }
84}
85
86#[derive(Debug, Clone)]
88#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
89pub struct FastqStats {
90 pub sequence_count: u64,
91 pub total_bases: u64,
92 pub gc_content: f64,
93 pub avg_length: f64,
94 pub mean_quality: f64,
95 pub q20_fraction: f64,
96 pub q30_fraction: f64,
97}
98
99pub fn parse_fastq_file(path: impl AsRef<Path>) -> Result<Vec<FastqRecord>> {
103 let path = path.as_ref();
104 let mut reader =
105 needletail::parse_fastx_file(path).map_err(|e| CyaneaError::Parse(e.to_string()))?;
106
107 let mut records = Vec::new();
108 while let Some(record) = reader.next() {
109 let record = record.map_err(|e| CyaneaError::Parse(e.to_string()))?;
110
111 let raw_id = std::str::from_utf8(record.id())
112 .map_err(|e| CyaneaError::Parse(e.to_string()))?;
113
114 let (name, description) = match raw_id.split_once(char::is_whitespace) {
116 Some((n, d)) => (n.to_string(), Some(d.to_string())),
117 None => (raw_id.to_string(), None),
118 };
119
120 let sequence = DnaSequence::new(record.seq())?;
121
122 let qual_bytes = record
123 .qual()
124 .ok_or_else(|| CyaneaError::Parse("missing quality scores".into()))?;
125 let quality = QualityScores::from_ascii(qual_bytes, PhredEncoding::Phred33)?;
126
127 records.push(FastqRecord::new(name, description, sequence, quality)?);
128 }
129
130 Ok(records)
131}
132
133pub fn parse_fastq_stats(path: impl AsRef<Path>) -> Result<FastqStats> {
135 let path = path.as_ref();
136 let mut reader =
137 needletail::parse_fastx_file(path).map_err(|e| CyaneaError::Parse(e.to_string()))?;
138
139 let mut sequence_count: u64 = 0;
140 let mut total_bases: u64 = 0;
141 let mut gc_count: u64 = 0;
142 let mut quality_sum: u64 = 0;
143 let mut q20_count: u64 = 0;
144 let mut q30_count: u64 = 0;
145
146 while let Some(record) = reader.next() {
147 let record = record.map_err(|e| CyaneaError::Parse(e.to_string()))?;
148 let seq = record.seq();
149
150 sequence_count += 1;
151 total_bases += seq.len() as u64;
152
153 for &base in seq.iter() {
154 match base.to_ascii_uppercase() {
155 b'G' | b'C' => gc_count += 1,
156 _ => {}
157 }
158 }
159
160 if let Some(qual) = record.qual() {
161 for &q in qual {
162 let phred = q.saturating_sub(33);
163 quality_sum += phred as u64;
164 if phred >= 20 {
165 q20_count += 1;
166 }
167 if phred >= 30 {
168 q30_count += 1;
169 }
170 }
171 }
172 }
173
174 let gc_content = if total_bases > 0 {
175 gc_count as f64 / total_bases as f64
176 } else {
177 0.0
178 };
179
180 let avg_length = if sequence_count > 0 {
181 total_bases as f64 / sequence_count as f64
182 } else {
183 0.0
184 };
185
186 let mean_quality = if total_bases > 0 {
187 quality_sum as f64 / total_bases as f64
188 } else {
189 0.0
190 };
191
192 let q20_fraction = if total_bases > 0 {
193 q20_count as f64 / total_bases as f64
194 } else {
195 0.0
196 };
197
198 let q30_fraction = if total_bases > 0 {
199 q30_count as f64 / total_bases as f64
200 } else {
201 0.0
202 };
203
204 Ok(FastqStats {
205 sequence_count,
206 total_bases,
207 gc_content,
208 avg_length,
209 mean_quality,
210 q20_fraction,
211 q30_fraction,
212 })
213}
214
215#[cfg(test)]
216mod tests {
217 use super::*;
218 use std::io::Write;
219 use tempfile::NamedTempFile;
220
221 #[test]
222 fn length_mismatch_error() {
223 let seq = DnaSequence::new(b"ACGT").unwrap();
224 let qual = QualityScores::from_raw(vec![30, 30, 30]); let result = FastqRecord::new("test".into(), None, seq, qual);
226 assert!(result.is_err());
227 }
228
229 #[test]
230 fn annotated_trait() {
231 let seq = DnaSequence::new(b"ACGT").unwrap();
232 let qual = QualityScores::from_raw(vec![30, 30, 30, 30]);
233 let record =
234 FastqRecord::new("read1".into(), Some("sample A".into()), seq, qual).unwrap();
235 assert_eq!(record.name(), "read1");
236 assert_eq!(record.description(), Some("sample A"));
237 }
238
239 #[test]
240 fn parse_temp_fastq() {
241 let mut file = NamedTempFile::new().unwrap();
242 writeln!(file, "@read1 test read").unwrap();
244 writeln!(file, "ACGTACGT").unwrap();
245 writeln!(file, "+").unwrap();
246 writeln!(file, "IIIIIIII").unwrap(); file.flush().unwrap();
248
249 let records = parse_fastq_file(file.path()).unwrap();
250 assert_eq!(records.len(), 1);
251 assert_eq!(records[0].name(), "read1");
252 assert_eq!(records[0].description(), Some("test read"));
253 assert_eq!(records[0].sequence().as_bytes(), b"ACGTACGT");
254 assert_eq!(records[0].quality().as_slice(), &[40; 8]);
255 }
256}