Skip to main content

ref_solver/parsing/
fasta.rs

1//! Parser for FASTA files using noodles.
2//!
3//! Extracts contig names and lengths from FASTA files.
4//! Supports both uncompressed and gzip/bgzip compressed files.
5//!
6//! Supported extensions:
7//! - `.fa`, `.fasta`, `.fna` (uncompressed)
8//! - `.fa.gz`, `.fasta.gz`, `.fna.gz` (gzip compressed)
9//! - `.fa.bgz`, `.fasta.bgz`, `.fna.bgz` (bgzip compressed)
10
11use std::ffi::OsStr;
12use std::io::{BufRead, BufReader};
13use std::path::Path;
14
15use flate2::read::GzDecoder;
16use noodles::fasta;
17
18use crate::core::contig::Contig;
19use crate::core::header::QueryHeader;
20use crate::parsing::sam::ParseError;
21use crate::utils::validation::check_contig_limit;
22
23/// Check if the path has a FASTA extension
24pub fn is_fasta_file(path: &Path) -> bool {
25    let path_str = path.to_string_lossy().to_lowercase();
26
27    // Check for gzipped FASTA
28    if path_str.ends_with(".fa.gz")
29        || path_str.ends_with(".fasta.gz")
30        || path_str.ends_with(".fna.gz")
31        || path_str.ends_with(".fa.bgz")
32        || path_str.ends_with(".fasta.bgz")
33        || path_str.ends_with(".fna.bgz")
34    {
35        return true;
36    }
37
38    // Check for uncompressed FASTA
39    matches!(
40        path.extension()
41            .and_then(OsStr::to_str)
42            .map(str::to_lowercase)
43            .as_deref(),
44        Some("fa" | "fasta" | "fna")
45    )
46}
47
48/// Check if the path is a gzipped file
49#[allow(clippy::case_sensitive_file_extension_comparisons)] // Already lowercased
50fn is_gzipped(path: &Path) -> bool {
51    let path_str = path.to_string_lossy().to_lowercase();
52    path_str.ends_with(".gz") || path_str.ends_with(".bgz")
53}
54
55/// Parse a FASTA file and extract contig names and lengths.
56///
57/// This reads through the entire FASTA file to determine sequence lengths.
58/// For large files, consider using an existing .fai index instead.
59///
60/// # Errors
61///
62/// Returns `ParseError::Io` if the file cannot be read, `ParseError::Noodles` if
63/// parsing fails, `ParseError::InvalidFormat` if no contigs are found, or
64/// `ParseError::TooManyContigs` if the limit is exceeded.
65pub fn parse_fasta_file(path: &Path) -> Result<QueryHeader, ParseError> {
66    if is_gzipped(path) {
67        parse_fasta_gzipped(path)
68    } else {
69        parse_fasta_uncompressed(path)
70    }
71}
72
73/// Parse a FASTA file and compute MD5 checksums for each sequence.
74///
75/// This reads through the entire FASTA file and computes MD5 on the
76/// uppercase sequence (standard for sequence checksums).
77/// This is slower than `parse_fasta_file` but provides MD5 for matching.
78///
79/// # Errors
80///
81/// Returns `ParseError::Io` if the file cannot be read, `ParseError::Noodles` if
82/// parsing fails, `ParseError::InvalidFormat` if no contigs are found, or
83/// `ParseError::TooManyContigs` if the limit is exceeded.
84pub fn parse_fasta_file_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
85    if is_gzipped(path) {
86        parse_fasta_gzipped_with_md5(path)
87    } else {
88        parse_fasta_uncompressed_with_md5(path)
89    }
90}
91
92/// Parse an uncompressed FASTA file with MD5 computation
93fn parse_fasta_uncompressed_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
94    let file = std::fs::File::open(path)?;
95    let reader = BufReader::new(file);
96    let mut fasta_reader = fasta::io::Reader::new(reader);
97
98    parse_fasta_reader_with_md5(&mut fasta_reader)
99}
100
101/// Parse a gzip-compressed FASTA file with MD5 computation
102fn parse_fasta_gzipped_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
103    let file = std::fs::File::open(path)?;
104    let decoder = GzDecoder::new(file);
105    let reader = BufReader::new(decoder);
106    let mut fasta_reader = fasta::io::Reader::new(reader);
107
108    parse_fasta_reader_with_md5(&mut fasta_reader)
109}
110
111/// Parse from a noodles FASTA reader with MD5 computation
112fn parse_fasta_reader_with_md5<R: BufRead>(
113    reader: &mut fasta::io::Reader<R>,
114) -> Result<QueryHeader, ParseError> {
115    let mut contigs = Vec::new();
116
117    for result in reader.records() {
118        let record = result
119            .map_err(|e| ParseError::Noodles(format!("Failed to parse FASTA record: {e}")))?;
120
121        // Check contig limit for DOS protection
122        if check_contig_limit(contigs.len()).is_some() {
123            return Err(ParseError::TooManyContigs(contigs.len()));
124        }
125
126        let name = String::from_utf8_lossy(record.name()).to_string();
127        let sequence = record.sequence();
128        let length = sequence.len() as u64;
129
130        // Compute MD5 on uppercase sequence (standard convention)
131        let uppercase: Vec<u8> = sequence
132            .as_ref()
133            .iter()
134            .map(u8::to_ascii_uppercase)
135            .collect();
136        let md5 = format!("{:x}", md5::compute(&uppercase));
137
138        let mut contig = Contig::new(name, length);
139        contig.md5 = Some(md5);
140        contigs.push(contig);
141    }
142
143    if contigs.is_empty() {
144        return Err(ParseError::InvalidFormat(
145            "No sequences found in FASTA file".to_string(),
146        ));
147    }
148
149    Ok(QueryHeader::new(contigs))
150}
151
152/// Parse an uncompressed FASTA file
153fn parse_fasta_uncompressed(path: &Path) -> Result<QueryHeader, ParseError> {
154    let file = std::fs::File::open(path)?;
155    let reader = BufReader::new(file);
156    let mut fasta_reader = fasta::io::Reader::new(reader);
157
158    parse_fasta_reader(&mut fasta_reader)
159}
160
161/// Parse a gzip-compressed FASTA file
162fn parse_fasta_gzipped(path: &Path) -> Result<QueryHeader, ParseError> {
163    let file = std::fs::File::open(path)?;
164    let decoder = GzDecoder::new(file);
165    let reader = BufReader::new(decoder);
166    let mut fasta_reader = fasta::io::Reader::new(reader);
167
168    parse_fasta_reader(&mut fasta_reader)
169}
170
171/// Parse from a noodles FASTA reader
172fn parse_fasta_reader<R: BufRead>(
173    reader: &mut fasta::io::Reader<R>,
174) -> Result<QueryHeader, ParseError> {
175    let mut contigs = Vec::new();
176
177    for result in reader.records() {
178        let record = result
179            .map_err(|e| ParseError::Noodles(format!("Failed to parse FASTA record: {e}")))?;
180
181        // Check contig limit for DOS protection
182        if check_contig_limit(contigs.len()).is_some() {
183            return Err(ParseError::TooManyContigs(contigs.len()));
184        }
185
186        let name = String::from_utf8_lossy(record.name()).to_string();
187        let length = record.sequence().len() as u64;
188
189        contigs.push(Contig::new(name, length));
190    }
191
192    if contigs.is_empty() {
193        return Err(ParseError::InvalidFormat(
194            "No sequences found in FASTA file".to_string(),
195        ));
196    }
197
198    Ok(QueryHeader::new(contigs))
199}
200
201#[cfg(test)]
202mod tests {
203    use super::*;
204    use std::io::Write;
205    use tempfile::NamedTempFile;
206
207    #[test]
208    fn test_is_fasta_file() {
209        assert!(is_fasta_file(Path::new("test.fa")));
210        assert!(is_fasta_file(Path::new("test.fasta")));
211        assert!(is_fasta_file(Path::new("test.fna")));
212        assert!(is_fasta_file(Path::new("test.fa.gz")));
213        assert!(is_fasta_file(Path::new("test.fasta.gz")));
214        assert!(is_fasta_file(Path::new("test.fna.bgz")));
215        assert!(is_fasta_file(Path::new("/path/to/Reference.FA")));
216
217        assert!(!is_fasta_file(Path::new("test.bam")));
218        assert!(!is_fasta_file(Path::new("test.sam")));
219        assert!(!is_fasta_file(Path::new("test.fai")));
220    }
221
222    #[test]
223    fn test_parse_fasta_file() {
224        let fasta_content = b">chr1 description\nACGTACGT\nACGT\n>chr2\nGGGG\n";
225
226        let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
227        temp.write_all(fasta_content).unwrap();
228        temp.flush().unwrap();
229
230        let query = parse_fasta_file(temp.path()).unwrap();
231        assert_eq!(query.contigs.len(), 2);
232        assert_eq!(query.contigs[0].name, "chr1");
233        assert_eq!(query.contigs[0].length, 12); // 8 + 4 bases
234        assert_eq!(query.contigs[1].name, "chr2");
235        assert_eq!(query.contigs[1].length, 4);
236    }
237
238    #[test]
239    fn test_parse_empty_fasta() {
240        let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
241        temp.write_all(b"").unwrap();
242        temp.flush().unwrap();
243
244        let result = parse_fasta_file(temp.path());
245        assert!(result.is_err());
246    }
247
248    #[test]
249    fn test_parse_fasta_with_md5() {
250        // Simple sequence to verify MD5 computation
251        // "ACGT" uppercase -> MD5 = f1f8f4bf413b16ad135722aa4591043e
252        let fasta_content = b">chr1\nACGT\n";
253
254        let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
255        temp.write_all(fasta_content).unwrap();
256        temp.flush().unwrap();
257
258        let query = parse_fasta_file_with_md5(temp.path()).unwrap();
259        assert_eq!(query.contigs.len(), 1);
260        assert_eq!(query.contigs[0].name, "chr1");
261        assert_eq!(query.contigs[0].length, 4);
262        assert_eq!(
263            query.contigs[0].md5,
264            Some("f1f8f4bf413b16ad135722aa4591043e".to_string())
265        );
266    }
267
268    #[test]
269    fn test_parse_fasta_with_md5_lowercase() {
270        // MD5 should be computed on uppercase, so "acgt" should give same result as "ACGT"
271        let fasta_content = b">chr1\nacgt\n";
272
273        let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
274        temp.write_all(fasta_content).unwrap();
275        temp.flush().unwrap();
276
277        let query = parse_fasta_file_with_md5(temp.path()).unwrap();
278        assert_eq!(
279            query.contigs[0].md5,
280            Some("f1f8f4bf413b16ad135722aa4591043e".to_string())
281        );
282    }
283}