Skip to main content

cyanea_seq/
fastq.rs

1//! FASTQ record type and parsing.
2//!
3//! [`FastqRecord`] wraps a [`DnaSequence`] with quality scores and metadata.
4//! Parsing uses needletail for streaming FASTQ reading.
5
6use std::path::Path;
7
8use cyanea_core::{Annotated, CyaneaError, Result, Sequence, Summarizable};
9
10use crate::quality::{PhredEncoding, QualityScores};
11use crate::types::DnaSequence;
12
13/// A single FASTQ record: name, sequence, and quality scores.
14#[derive(Debug, Clone)]
15#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
16pub struct FastqRecord {
17    name: String,
18    description: Option<String>,
19    sequence: DnaSequence,
20    quality: QualityScores,
21}
22
23impl FastqRecord {
24    /// Create a new FASTQ record.
25    ///
26    /// Returns an error if the sequence and quality lengths don't match.
27    pub fn new(
28        name: String,
29        description: Option<String>,
30        sequence: DnaSequence,
31        quality: QualityScores,
32    ) -> Result<Self> {
33        if sequence.len() != quality.len() {
34            return Err(CyaneaError::InvalidInput(format!(
35                "sequence length ({}) does not match quality length ({})",
36                sequence.len(),
37                quality.len()
38            )));
39        }
40        Ok(Self {
41            name,
42            description,
43            sequence,
44            quality,
45        })
46    }
47
48    /// The underlying DNA sequence.
49    pub fn sequence(&self) -> &DnaSequence {
50        &self.sequence
51    }
52
53    /// The quality scores.
54    pub fn quality(&self) -> &QualityScores {
55        &self.quality
56    }
57}
58
59impl Sequence for FastqRecord {
60    fn as_bytes(&self) -> &[u8] {
61        self.sequence.as_bytes()
62    }
63}
64
65impl Annotated for FastqRecord {
66    fn name(&self) -> &str {
67        &self.name
68    }
69
70    fn description(&self) -> Option<&str> {
71        self.description.as_deref()
72    }
73}
74
75impl Summarizable for FastqRecord {
76    fn summary(&self) -> String {
77        format!(
78            "FASTQ {} ({} bp, mean Q{:.1})",
79            self.name,
80            self.sequence.len(),
81            self.quality.mean()
82        )
83    }
84}
85
86/// Summary statistics for a FASTQ file.
87#[derive(Debug, Clone)]
88#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
89pub struct FastqStats {
90    pub sequence_count: u64,
91    pub total_bases: u64,
92    pub gc_content: f64,
93    pub avg_length: f64,
94    pub mean_quality: f64,
95    pub q20_fraction: f64,
96    pub q30_fraction: f64,
97}
98
99/// Parse a FASTQ file into a vector of [`FastqRecord`]s.
100///
101/// Uses Phred+33 encoding for quality scores. All sequences are treated as DNA.
102pub fn parse_fastq_file(path: impl AsRef<Path>) -> Result<Vec<FastqRecord>> {
103    let path = path.as_ref();
104    let mut reader =
105        needletail::parse_fastx_file(path).map_err(|e| CyaneaError::Parse(e.to_string()))?;
106
107    let mut records = Vec::new();
108    while let Some(record) = reader.next() {
109        let record = record.map_err(|e| CyaneaError::Parse(e.to_string()))?;
110
111        let raw_id = std::str::from_utf8(record.id())
112            .map_err(|e| CyaneaError::Parse(e.to_string()))?;
113
114        // Split "name description" on first whitespace
115        let (name, description) = match raw_id.split_once(char::is_whitespace) {
116            Some((n, d)) => (n.to_string(), Some(d.to_string())),
117            None => (raw_id.to_string(), None),
118        };
119
120        let sequence = DnaSequence::new(record.seq())?;
121
122        let qual_bytes = record
123            .qual()
124            .ok_or_else(|| CyaneaError::Parse("missing quality scores".into()))?;
125        let quality = QualityScores::from_ascii(qual_bytes, PhredEncoding::Phred33)?;
126
127        records.push(FastqRecord::new(name, description, sequence, quality)?);
128    }
129
130    Ok(records)
131}
132
133/// Parse a FASTQ file and compute summary statistics without storing records.
134pub fn parse_fastq_stats(path: impl AsRef<Path>) -> Result<FastqStats> {
135    let path = path.as_ref();
136    let mut reader =
137        needletail::parse_fastx_file(path).map_err(|e| CyaneaError::Parse(e.to_string()))?;
138
139    let mut sequence_count: u64 = 0;
140    let mut total_bases: u64 = 0;
141    let mut gc_count: u64 = 0;
142    let mut quality_sum: u64 = 0;
143    let mut q20_count: u64 = 0;
144    let mut q30_count: u64 = 0;
145
146    while let Some(record) = reader.next() {
147        let record = record.map_err(|e| CyaneaError::Parse(e.to_string()))?;
148        let seq = record.seq();
149
150        sequence_count += 1;
151        total_bases += seq.len() as u64;
152
153        for &base in seq.iter() {
154            match base.to_ascii_uppercase() {
155                b'G' | b'C' => gc_count += 1,
156                _ => {}
157            }
158        }
159
160        if let Some(qual) = record.qual() {
161            for &q in qual {
162                let phred = q.saturating_sub(33);
163                quality_sum += phred as u64;
164                if phred >= 20 {
165                    q20_count += 1;
166                }
167                if phred >= 30 {
168                    q30_count += 1;
169                }
170            }
171        }
172    }
173
174    let gc_content = if total_bases > 0 {
175        gc_count as f64 / total_bases as f64
176    } else {
177        0.0
178    };
179
180    let avg_length = if sequence_count > 0 {
181        total_bases as f64 / sequence_count as f64
182    } else {
183        0.0
184    };
185
186    let mean_quality = if total_bases > 0 {
187        quality_sum as f64 / total_bases as f64
188    } else {
189        0.0
190    };
191
192    let q20_fraction = if total_bases > 0 {
193        q20_count as f64 / total_bases as f64
194    } else {
195        0.0
196    };
197
198    let q30_fraction = if total_bases > 0 {
199        q30_count as f64 / total_bases as f64
200    } else {
201        0.0
202    };
203
204    Ok(FastqStats {
205        sequence_count,
206        total_bases,
207        gc_content,
208        avg_length,
209        mean_quality,
210        q20_fraction,
211        q30_fraction,
212    })
213}
214
215#[cfg(test)]
216mod tests {
217    use super::*;
218    use std::io::Write;
219    use tempfile::NamedTempFile;
220
221    #[test]
222    fn length_mismatch_error() {
223        let seq = DnaSequence::new(b"ACGT").unwrap();
224        let qual = QualityScores::from_raw(vec![30, 30, 30]); // length 3 vs 4
225        let result = FastqRecord::new("test".into(), None, seq, qual);
226        assert!(result.is_err());
227    }
228
229    #[test]
230    fn annotated_trait() {
231        let seq = DnaSequence::new(b"ACGT").unwrap();
232        let qual = QualityScores::from_raw(vec![30, 30, 30, 30]);
233        let record =
234            FastqRecord::new("read1".into(), Some("sample A".into()), seq, qual).unwrap();
235        assert_eq!(record.name(), "read1");
236        assert_eq!(record.description(), Some("sample A"));
237    }
238
239    #[test]
240    fn parse_temp_fastq() {
241        let mut file = NamedTempFile::new().unwrap();
242        // Write a minimal FASTQ record
243        writeln!(file, "@read1 test read").unwrap();
244        writeln!(file, "ACGTACGT").unwrap();
245        writeln!(file, "+").unwrap();
246        writeln!(file, "IIIIIIII").unwrap(); // Q40 in Phred+33
247        file.flush().unwrap();
248
249        let records = parse_fastq_file(file.path()).unwrap();
250        assert_eq!(records.len(), 1);
251        assert_eq!(records[0].name(), "read1");
252        assert_eq!(records[0].description(), Some("test read"));
253        assert_eq!(records[0].sequence().as_bytes(), b"ACGTACGT");
254        assert_eq!(records[0].quality().as_slice(), &[40; 8]);
255    }
256}