Skip to main content

ref_solver/parsing/
fasta.rs

1//! Parser for FASTA files using noodles.
2//!
3//! Extracts contig names and lengths from FASTA files.
4//! Supports both uncompressed and gzip/bgzip compressed files.
5//!
6//! Supported extensions:
7//! - `.fa`, `.fasta`, `.fna` (uncompressed)
8//! - `.fa.gz`, `.fasta.gz`, `.fna.gz` (gzip compressed)
9//! - `.fa.bgz`, `.fasta.bgz`, `.fna.bgz` (bgzip compressed)
10
11use std::ffi::OsStr;
12use std::io::{BufRead, BufReader};
13use std::path::Path;
14
15use flate2::read::GzDecoder;
16use noodles::fasta;
17
18use crate::core::contig::Contig;
19use crate::core::header::QueryHeader;
20use crate::parsing::sam::ParseError;
21use crate::utils::validation::{check_contig_limit, compute_sha512t24u, is_valid_sha512t24u};
22
23/// Check if the path has a FASTA extension
24pub fn is_fasta_file(path: &Path) -> bool {
25    let path_str = path.to_string_lossy().to_lowercase();
26
27    // Check for gzipped FASTA
28    if path_str.ends_with(".fa.gz")
29        || path_str.ends_with(".fasta.gz")
30        || path_str.ends_with(".fna.gz")
31        || path_str.ends_with(".fa.bgz")
32        || path_str.ends_with(".fasta.bgz")
33        || path_str.ends_with(".fna.bgz")
34    {
35        return true;
36    }
37
38    // Check for uncompressed FASTA
39    matches!(
40        path.extension()
41            .and_then(OsStr::to_str)
42            .map(str::to_lowercase)
43            .as_deref(),
44        Some("fa" | "fasta" | "fna")
45    )
46}
47
48/// Check if the path is a gzipped file
49#[allow(clippy::case_sensitive_file_extension_comparisons)] // Already lowercased
50fn is_gzipped(path: &Path) -> bool {
51    let path_str = path.to_string_lossy().to_lowercase();
52    path_str.ends_with(".gz") || path_str.ends_with(".bgz")
53}
54
55/// Parse a FASTA file and extract contig names and lengths.
56///
57/// This reads through the entire FASTA file to determine sequence lengths.
58/// For large files, consider using an existing .fai index instead.
59///
60/// # Errors
61///
62/// Returns `ParseError::Io` if the file cannot be read, `ParseError::Noodles` if
63/// parsing fails, `ParseError::InvalidFormat` if no contigs are found, or
64/// `ParseError::TooManyContigs` if the limit is exceeded.
65pub fn parse_fasta_file(path: &Path) -> Result<QueryHeader, ParseError> {
66    if is_gzipped(path) {
67        parse_fasta_gzipped(path)
68    } else {
69        parse_fasta_uncompressed(path)
70    }
71}
72
73/// Parse a FASTA file and compute MD5 checksums for each sequence.
74///
75/// This reads through the entire FASTA file and computes MD5 on the
76/// uppercase sequence (standard for sequence checksums).
77/// This is slower than `parse_fasta_file` but provides MD5 for matching.
78///
79/// # Errors
80///
81/// Returns `ParseError::Io` if the file cannot be read, `ParseError::Noodles` if
82/// parsing fails, `ParseError::InvalidFormat` if no contigs are found, or
83/// `ParseError::TooManyContigs` if the limit is exceeded.
84pub fn parse_fasta_file_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
85    if is_gzipped(path) {
86        parse_fasta_gzipped_with_md5(path)
87    } else {
88        parse_fasta_uncompressed_with_md5(path)
89    }
90}
91
92/// Parse an uncompressed FASTA file with MD5 computation
93fn parse_fasta_uncompressed_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
94    let file = std::fs::File::open(path)?;
95    let reader = BufReader::new(file);
96    let mut fasta_reader = fasta::io::Reader::new(reader);
97
98    parse_fasta_reader_with_md5(&mut fasta_reader)
99}
100
101/// Parse a gzip-compressed FASTA file with MD5 computation
102fn parse_fasta_gzipped_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
103    let file = std::fs::File::open(path)?;
104    let decoder = GzDecoder::new(file);
105    let reader = BufReader::new(decoder);
106    let mut fasta_reader = fasta::io::Reader::new(reader);
107
108    parse_fasta_reader_with_md5(&mut fasta_reader)
109}
110
111/// Parse from a noodles FASTA reader with MD5 computation
112fn parse_fasta_reader_with_md5<R: BufRead>(
113    reader: &mut fasta::io::Reader<R>,
114) -> Result<QueryHeader, ParseError> {
115    let mut contigs = Vec::new();
116
117    for result in reader.records() {
118        let record = result
119            .map_err(|e| ParseError::Noodles(format!("Failed to parse FASTA record: {e}")))?;
120
121        // Check contig limit for DOS protection
122        if check_contig_limit(contigs.len()).is_some() {
123            return Err(ParseError::TooManyContigs(contigs.len()));
124        }
125
126        let name = String::from_utf8_lossy(record.name()).to_string();
127        let sequence = record.sequence();
128        let length = sequence.len() as u64;
129
130        // Compute digests on uppercase sequence (standard convention)
131        let uppercase: Vec<u8> = sequence
132            .as_ref()
133            .iter()
134            .map(u8::to_ascii_uppercase)
135            .collect();
136        let md5 = format!("{:x}", md5::compute(&uppercase));
137        let sha512t24u = compute_sha512t24u(&uppercase);
138        debug_assert!(is_valid_sha512t24u(&sha512t24u));
139
140        let mut contig = Contig::new(name, length);
141        contig.md5 = Some(md5);
142        contig.sha512t24u = Some(sha512t24u);
143        contigs.push(contig);
144    }
145
146    if contigs.is_empty() {
147        return Err(ParseError::InvalidFormat(
148            "No sequences found in FASTA file".to_string(),
149        ));
150    }
151
152    Ok(QueryHeader::new(contigs))
153}
154
155/// Parse an uncompressed FASTA file
156fn parse_fasta_uncompressed(path: &Path) -> Result<QueryHeader, ParseError> {
157    let file = std::fs::File::open(path)?;
158    let reader = BufReader::new(file);
159    let mut fasta_reader = fasta::io::Reader::new(reader);
160
161    parse_fasta_reader(&mut fasta_reader)
162}
163
164/// Parse a gzip-compressed FASTA file
165fn parse_fasta_gzipped(path: &Path) -> Result<QueryHeader, ParseError> {
166    let file = std::fs::File::open(path)?;
167    let decoder = GzDecoder::new(file);
168    let reader = BufReader::new(decoder);
169    let mut fasta_reader = fasta::io::Reader::new(reader);
170
171    parse_fasta_reader(&mut fasta_reader)
172}
173
174/// Parse from a noodles FASTA reader
175fn parse_fasta_reader<R: BufRead>(
176    reader: &mut fasta::io::Reader<R>,
177) -> Result<QueryHeader, ParseError> {
178    let mut contigs = Vec::new();
179
180    for result in reader.records() {
181        let record = result
182            .map_err(|e| ParseError::Noodles(format!("Failed to parse FASTA record: {e}")))?;
183
184        // Check contig limit for DOS protection
185        if check_contig_limit(contigs.len()).is_some() {
186            return Err(ParseError::TooManyContigs(contigs.len()));
187        }
188
189        let name = String::from_utf8_lossy(record.name()).to_string();
190        let length = record.sequence().len() as u64;
191
192        contigs.push(Contig::new(name, length));
193    }
194
195    if contigs.is_empty() {
196        return Err(ParseError::InvalidFormat(
197            "No sequences found in FASTA file".to_string(),
198        ));
199    }
200
201    Ok(QueryHeader::new(contigs))
202}
203
204#[cfg(test)]
205mod tests {
206    use super::*;
207    use std::io::Write;
208    use tempfile::NamedTempFile;
209
210    #[test]
211    fn test_is_fasta_file() {
212        assert!(is_fasta_file(Path::new("test.fa")));
213        assert!(is_fasta_file(Path::new("test.fasta")));
214        assert!(is_fasta_file(Path::new("test.fna")));
215        assert!(is_fasta_file(Path::new("test.fa.gz")));
216        assert!(is_fasta_file(Path::new("test.fasta.gz")));
217        assert!(is_fasta_file(Path::new("test.fna.bgz")));
218        assert!(is_fasta_file(Path::new("/path/to/Reference.FA")));
219
220        assert!(!is_fasta_file(Path::new("test.bam")));
221        assert!(!is_fasta_file(Path::new("test.sam")));
222        assert!(!is_fasta_file(Path::new("test.fai")));
223    }
224
225    #[test]
226    fn test_parse_fasta_file() {
227        let fasta_content = b">chr1 description\nACGTACGT\nACGT\n>chr2\nGGGG\n";
228
229        let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
230        temp.write_all(fasta_content).unwrap();
231        temp.flush().unwrap();
232
233        let query = parse_fasta_file(temp.path()).unwrap();
234        assert_eq!(query.contigs.len(), 2);
235        assert_eq!(query.contigs[0].name, "chr1");
236        assert_eq!(query.contigs[0].length, 12); // 8 + 4 bases
237        assert_eq!(query.contigs[1].name, "chr2");
238        assert_eq!(query.contigs[1].length, 4);
239    }
240
241    #[test]
242    fn test_parse_empty_fasta() {
243        let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
244        temp.write_all(b"").unwrap();
245        temp.flush().unwrap();
246
247        let result = parse_fasta_file(temp.path());
248        assert!(result.is_err());
249    }
250
251    #[test]
252    fn test_parse_fasta_with_md5() {
253        // Simple sequence to verify MD5 computation
254        // "ACGT" uppercase -> MD5 = f1f8f4bf413b16ad135722aa4591043e
255        let fasta_content = b">chr1\nACGT\n";
256
257        let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
258        temp.write_all(fasta_content).unwrap();
259        temp.flush().unwrap();
260
261        let query = parse_fasta_file_with_md5(temp.path()).unwrap();
262        assert_eq!(query.contigs.len(), 1);
263        assert_eq!(query.contigs[0].name, "chr1");
264        assert_eq!(query.contigs[0].length, 4);
265        assert_eq!(
266            query.contigs[0].md5,
267            Some("f1f8f4bf413b16ad135722aa4591043e".to_string())
268        );
269    }
270
271    #[test]
272    fn test_parse_fasta_with_md5_lowercase() {
273        // MD5 should be computed on uppercase, so "acgt" should give same result as "ACGT"
274        let fasta_content = b">chr1\nacgt\n";
275
276        let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
277        temp.write_all(fasta_content).unwrap();
278        temp.flush().unwrap();
279
280        let query = parse_fasta_file_with_md5(temp.path()).unwrap();
281        assert_eq!(
282            query.contigs[0].md5,
283            Some("f1f8f4bf413b16ad135722aa4591043e".to_string())
284        );
285    }
286}