sshash_lib/builder/
parse.rs1use anyhow::{Context, Result};
7use needletail::parse_fastx_file;
8use std::path::Path;
9
10pub fn parse_sequences<P, F>(path: P, mut callback: F) -> Result<()>
22where
23 P: AsRef<Path>,
24 F: FnMut(&[u8], &[u8]) -> Result<()>,
25{
26 let path = path.as_ref();
27
28 let mut reader = parse_fastx_file(path)
30 .with_context(|| format!("Failed to open sequence file: {}", path.display()))?;
31
32 while let Some(record) = reader.next() {
33 let record = record
34 .with_context(|| format!("Failed to parse sequence record in {}", path.display()))?;
35
36 let seq = record.seq();
38 validate_dna_sequence(&seq)
39 .with_context(|| format!("Invalid DNA sequence in {}", path.display()))?;
40
41 callback(record.id(), &seq)?;
43 }
44
45 Ok(())
46}
47
48pub fn validate_dna_sequence(seq: &[u8]) -> Result<()> {
56 for (i, &base) in seq.iter().enumerate() {
57 match base {
58 b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't' => {}
59 _ => {
60 return Err(anyhow::anyhow!(
61 "Invalid DNA base '{}' at position {}. Only A, C, G, T are allowed.",
62 base as char,
63 i
64 ));
65 }
66 }
67 }
68 Ok(())
69}
70
71pub fn count_sequences<P: AsRef<Path>>(path: P) -> Result<(usize, usize)> {
79 let mut num_sequences = 0;
80 let mut total_bases = 0;
81
82 parse_sequences(path, |_name, seq| {
83 num_sequences += 1;
84 total_bases += seq.len();
85 Ok(())
86 })?;
87
88 Ok((num_sequences, total_bases))
89}
90
91#[cfg(test)]
92mod tests {
93 use super::*;
94 use std::io::Write;
95 use tempfile::NamedTempFile;
96
97 #[test]
98 fn test_validate_dna_sequence_valid() {
99 assert!(validate_dna_sequence(b"ACGT").is_ok());
100 assert!(validate_dna_sequence(b"acgt").is_ok());
101 assert!(validate_dna_sequence(b"ACGTacgt").is_ok());
102 }
103
104 #[test]
105 fn test_validate_dna_sequence_invalid() {
106 assert!(validate_dna_sequence(b"ACGTN").is_err()); assert!(validate_dna_sequence(b"ACGT ").is_err()); assert!(validate_dna_sequence(b"ACG-T").is_err()); }
110
111 #[test]
112 fn test_parse_fasta_file() -> Result<()> {
113 let mut temp_file = NamedTempFile::new()?;
114 writeln!(temp_file, ">seq1")?;
115 writeln!(temp_file, "ACGT")?;
116 writeln!(temp_file, ">seq2")?;
117 writeln!(temp_file, "TGCA")?;
118 temp_file.flush()?;
119
120 let mut sequences = Vec::new();
121 parse_sequences(temp_file.path(), |name, seq| {
122 sequences.push((name.to_vec(), seq.to_vec()));
123 Ok(())
124 })?;
125
126 assert_eq!(sequences.len(), 2);
127 assert_eq!(sequences[0].0, b"seq1");
128 assert_eq!(sequences[0].1, b"ACGT");
129 assert_eq!(sequences[1].0, b"seq2");
130 assert_eq!(sequences[1].1, b"TGCA");
131
132 Ok(())
133 }
134
135 #[test]
136 fn test_count_sequences() -> Result<()> {
137 let mut temp_file = NamedTempFile::new()?;
138 writeln!(temp_file, ">seq1")?;
139 writeln!(temp_file, "ACGT")?;
140 writeln!(temp_file, ">seq2")?;
141 writeln!(temp_file, "TGCATGCA")?;
142 temp_file.flush()?;
143
144 let (num_seqs, total_bases) = count_sequences(temp_file.path())?;
145 assert_eq!(num_seqs, 2);
146 assert_eq!(total_bases, 12); Ok(())
149 }
150}