Skip to main content

cyanea_seq/
fasta.rs

1//! FASTA/FASTQ parsing and statistics.
2
3use std::path::Path;
4
5use cyanea_core::{CyaneaError, Result};
6use needletail::parse_fastx_file;
7
8/// Summary statistics for a FASTA/FASTQ file.
9#[derive(Debug, Clone)]
10#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
11pub struct FastaStats {
12    pub sequence_count: u64,
13    pub total_bases: u64,
14    pub gc_content: f64,
15    pub avg_length: f64,
16}
17
18/// Parse a FASTA/FASTQ file and compute summary statistics.
19pub fn parse_fasta_stats(path: impl AsRef<Path>) -> Result<FastaStats> {
20    let path = path.as_ref();
21    let mut reader = parse_fastx_file(path).map_err(|e| CyaneaError::Parse(e.to_string()))?;
22
23    let mut sequence_count: u64 = 0;
24    let mut total_bases: u64 = 0;
25    let mut gc_count: u64 = 0;
26
27    while let Some(record) = reader.next() {
28        let record = record.map_err(|e| CyaneaError::Parse(e.to_string()))?;
29        let seq = record.seq();
30
31        sequence_count += 1;
32        total_bases += seq.len() as u64;
33
34        for &base in seq.iter() {
35            match base {
36                b'G' | b'g' | b'C' | b'c' => gc_count += 1,
37                _ => {}
38            }
39        }
40    }
41
42    let gc_content = if total_bases > 0 {
43        (gc_count as f64 / total_bases as f64) * 100.0
44    } else {
45        0.0
46    };
47
48    let avg_length = if sequence_count > 0 {
49        total_bases as f64 / sequence_count as f64
50    } else {
51        0.0
52    };
53
54    Ok(FastaStats {
55        sequence_count,
56        total_bases,
57        gc_content,
58        avg_length,
59    })
60}
61
62#[cfg(test)]
63mod tests {
64    use super::*;
65    use std::io::Write;
66    use tempfile::NamedTempFile;
67
68    #[test]
69    fn test_fasta_parsing() {
70        let mut file = NamedTempFile::new().unwrap();
71        writeln!(file, ">seq1").unwrap();
72        writeln!(file, "ATCGATCG").unwrap();
73        writeln!(file, ">seq2").unwrap();
74        writeln!(file, "GCGCGCGC").unwrap();
75        file.flush().unwrap();
76
77        let stats = parse_fasta_stats(file.path()).unwrap();
78        assert_eq!(stats.sequence_count, 2);
79        assert_eq!(stats.total_bases, 16);
80        assert!((stats.gc_content - 75.0).abs() < 0.01);
81        assert!((stats.avg_length - 8.0).abs() < 0.01);
82    }
83
84    #[test]
85    fn test_empty_fasta() {
86        let file = NamedTempFile::new().unwrap();
87
88        // needletail returns an error for empty files
89        let result = parse_fasta_stats(file.path());
90        assert!(result.is_err());
91    }
92
93    #[test]
94    fn test_fasta_file_not_found() {
95        let result = parse_fasta_stats("/nonexistent/file.fasta");
96        assert!(result.is_err());
97    }
98}