Skip to main content

sshash_lib/builder/
parse.rs

1//! FASTA/FASTQ parsing with automatic decompression
2//!
3//! Reads DNA sequences from FASTA or FASTQ files, with transparent
4//! gzip decompression. Validates DNA alphabet (A, C, G, T only).
5
6use anyhow::{Context, Result};
7use needletail::parse_fastx_file;
8use std::path::Path;
9
10/// Parse a FASTA/FASTQ file and call a function for each valid DNA sequence
11///
12/// # Arguments
13/// * `path` - Path to input file (may be gzipped)
14/// * `callback` - Function called for each sequence, receives (name, sequence)
15///
16/// # Errors
17/// Returns error if:
18/// - File cannot be opened
19/// - File format is invalid
20/// - Sequence contains non-DNA characters
21pub fn parse_sequences<P, F>(path: P, mut callback: F) -> Result<()>
22where
23    P: AsRef<Path>,
24    F: FnMut(&[u8], &[u8]) -> Result<()>,
25{
26    let path = path.as_ref();
27    
28    // needletail automatically handles gzip decompression
29    let mut reader = parse_fastx_file(path)
30        .with_context(|| format!("Failed to open sequence file: {}", path.display()))?;
31    
32    while let Some(record) = reader.next() {
33        let record = record
34            .with_context(|| format!("Failed to parse sequence record in {}", path.display()))?;
35        
36        // Validate DNA alphabet (seq is Cow<[u8]>, convert to &[u8])
37        let seq = record.seq();
38        validate_dna_sequence(&seq)
39            .with_context(|| format!("Invalid DNA sequence in {}", path.display()))?;
40        
41        // Call user callback with borrowed slices
42        callback(record.id(), &seq)?;
43    }
44    
45    Ok(())
46}
47
48/// Validate that a sequence contains only valid DNA bases (A, C, G, T)
49///
50/// # Arguments
51/// * `seq` - Sequence bytes to validate
52///
53/// # Errors
54/// Returns error if sequence contains non-ACGT characters
55pub fn validate_dna_sequence(seq: &[u8]) -> Result<()> {
56    for (i, &base) in seq.iter().enumerate() {
57        match base {
58            b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't' => {}
59            _ => {
60                return Err(anyhow::anyhow!(
61                    "Invalid DNA base '{}' at position {}. Only A, C, G, T are allowed.",
62                    base as char,
63                    i
64                ));
65            }
66        }
67    }
68    Ok(())
69}
70
71/// Count sequences and total bases in a file
72///
73/// # Arguments
74/// * `path` - Path to input file
75///
76/// # Returns
77/// `(num_sequences, total_bases)`
78pub fn count_sequences<P: AsRef<Path>>(path: P) -> Result<(usize, usize)> {
79    let mut num_sequences = 0;
80    let mut total_bases = 0;
81    
82    parse_sequences(path, |_name, seq| {
83        num_sequences += 1;
84        total_bases += seq.len();
85        Ok(())
86    })?;
87    
88    Ok((num_sequences, total_bases))
89}
90
91#[cfg(test)]
92mod tests {
93    use super::*;
94    use std::io::Write;
95    use tempfile::NamedTempFile;
96    
97    #[test]
98    fn test_validate_dna_sequence_valid() {
99        assert!(validate_dna_sequence(b"ACGT").is_ok());
100        assert!(validate_dna_sequence(b"acgt").is_ok());
101        assert!(validate_dna_sequence(b"ACGTacgt").is_ok());
102    }
103    
104    #[test]
105    fn test_validate_dna_sequence_invalid() {
106        assert!(validate_dna_sequence(b"ACGTN").is_err());  // N
107        assert!(validate_dna_sequence(b"ACGT ").is_err());   // Space
108        assert!(validate_dna_sequence(b"ACG-T").is_err());   // Dash
109    }
110    
111    #[test]
112    fn test_parse_fasta_file() -> Result<()> {
113        let mut temp_file = NamedTempFile::new()?;
114        writeln!(temp_file, ">seq1")?;
115        writeln!(temp_file, "ACGT")?;
116        writeln!(temp_file, ">seq2")?;
117        writeln!(temp_file, "TGCA")?;
118        temp_file.flush()?;
119        
120        let mut sequences = Vec::new();
121        parse_sequences(temp_file.path(), |name, seq| {
122            sequences.push((name.to_vec(), seq.to_vec()));
123            Ok(())
124        })?;
125        
126        assert_eq!(sequences.len(), 2);
127        assert_eq!(sequences[0].0, b"seq1");
128        assert_eq!(sequences[0].1, b"ACGT");
129        assert_eq!(sequences[1].0, b"seq2");
130        assert_eq!(sequences[1].1, b"TGCA");
131        
132        Ok(())
133    }
134    
135    #[test]
136    fn test_count_sequences() -> Result<()> {
137        let mut temp_file = NamedTempFile::new()?;
138        writeln!(temp_file, ">seq1")?;
139        writeln!(temp_file, "ACGT")?;
140        writeln!(temp_file, ">seq2")?;
141        writeln!(temp_file, "TGCATGCA")?;
142        temp_file.flush()?;
143        
144        let (num_seqs, total_bases) = count_sequences(temp_file.path())?;
145        assert_eq!(num_seqs, 2);
146        assert_eq!(total_bases, 12);  // 4 + 8
147        
148        Ok(())
149    }
150}