ref_solver/parsing/
fasta.rs1use std::ffi::OsStr;
12use std::io::{BufRead, BufReader};
13use std::path::Path;
14
15use flate2::read::GzDecoder;
16use noodles::fasta;
17
18use crate::core::contig::Contig;
19use crate::core::header::QueryHeader;
20use crate::parsing::sam::ParseError;
21use crate::utils::validation::{check_contig_limit, compute_sha512t24u, is_valid_sha512t24u};
22
23pub fn is_fasta_file(path: &Path) -> bool {
25 let path_str = path.to_string_lossy().to_lowercase();
26
27 if path_str.ends_with(".fa.gz")
29 || path_str.ends_with(".fasta.gz")
30 || path_str.ends_with(".fna.gz")
31 || path_str.ends_with(".fa.bgz")
32 || path_str.ends_with(".fasta.bgz")
33 || path_str.ends_with(".fna.bgz")
34 {
35 return true;
36 }
37
38 matches!(
40 path.extension()
41 .and_then(OsStr::to_str)
42 .map(str::to_lowercase)
43 .as_deref(),
44 Some("fa" | "fasta" | "fna")
45 )
46}
47
48#[allow(clippy::case_sensitive_file_extension_comparisons)] fn is_gzipped(path: &Path) -> bool {
51 let path_str = path.to_string_lossy().to_lowercase();
52 path_str.ends_with(".gz") || path_str.ends_with(".bgz")
53}
54
55pub fn parse_fasta_file(path: &Path) -> Result<QueryHeader, ParseError> {
66 if is_gzipped(path) {
67 parse_fasta_gzipped(path)
68 } else {
69 parse_fasta_uncompressed(path)
70 }
71}
72
73pub fn parse_fasta_file_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
85 if is_gzipped(path) {
86 parse_fasta_gzipped_with_md5(path)
87 } else {
88 parse_fasta_uncompressed_with_md5(path)
89 }
90}
91
92fn parse_fasta_uncompressed_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
94 let file = std::fs::File::open(path)?;
95 let reader = BufReader::new(file);
96 let mut fasta_reader = fasta::io::Reader::new(reader);
97
98 parse_fasta_reader_with_md5(&mut fasta_reader)
99}
100
101fn parse_fasta_gzipped_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
103 let file = std::fs::File::open(path)?;
104 let decoder = GzDecoder::new(file);
105 let reader = BufReader::new(decoder);
106 let mut fasta_reader = fasta::io::Reader::new(reader);
107
108 parse_fasta_reader_with_md5(&mut fasta_reader)
109}
110
111fn parse_fasta_reader_with_md5<R: BufRead>(
113 reader: &mut fasta::io::Reader<R>,
114) -> Result<QueryHeader, ParseError> {
115 let mut contigs = Vec::new();
116
117 for result in reader.records() {
118 let record = result
119 .map_err(|e| ParseError::Noodles(format!("Failed to parse FASTA record: {e}")))?;
120
121 if check_contig_limit(contigs.len()).is_some() {
123 return Err(ParseError::TooManyContigs(contigs.len()));
124 }
125
126 let name = String::from_utf8_lossy(record.name()).to_string();
127 let sequence = record.sequence();
128 let length = sequence.len() as u64;
129
130 let uppercase: Vec<u8> = sequence
132 .as_ref()
133 .iter()
134 .map(u8::to_ascii_uppercase)
135 .collect();
136 let md5 = format!("{:x}", md5::compute(&uppercase));
137 let sha512t24u = compute_sha512t24u(&uppercase);
138 debug_assert!(is_valid_sha512t24u(&sha512t24u));
139
140 let mut contig = Contig::new(name, length);
141 contig.md5 = Some(md5);
142 contig.sha512t24u = Some(sha512t24u);
143 contigs.push(contig);
144 }
145
146 if contigs.is_empty() {
147 return Err(ParseError::InvalidFormat(
148 "No sequences found in FASTA file".to_string(),
149 ));
150 }
151
152 Ok(QueryHeader::new(contigs))
153}
154
155fn parse_fasta_uncompressed(path: &Path) -> Result<QueryHeader, ParseError> {
157 let file = std::fs::File::open(path)?;
158 let reader = BufReader::new(file);
159 let mut fasta_reader = fasta::io::Reader::new(reader);
160
161 parse_fasta_reader(&mut fasta_reader)
162}
163
164fn parse_fasta_gzipped(path: &Path) -> Result<QueryHeader, ParseError> {
166 let file = std::fs::File::open(path)?;
167 let decoder = GzDecoder::new(file);
168 let reader = BufReader::new(decoder);
169 let mut fasta_reader = fasta::io::Reader::new(reader);
170
171 parse_fasta_reader(&mut fasta_reader)
172}
173
174fn parse_fasta_reader<R: BufRead>(
176 reader: &mut fasta::io::Reader<R>,
177) -> Result<QueryHeader, ParseError> {
178 let mut contigs = Vec::new();
179
180 for result in reader.records() {
181 let record = result
182 .map_err(|e| ParseError::Noodles(format!("Failed to parse FASTA record: {e}")))?;
183
184 if check_contig_limit(contigs.len()).is_some() {
186 return Err(ParseError::TooManyContigs(contigs.len()));
187 }
188
189 let name = String::from_utf8_lossy(record.name()).to_string();
190 let length = record.sequence().len() as u64;
191
192 contigs.push(Contig::new(name, length));
193 }
194
195 if contigs.is_empty() {
196 return Err(ParseError::InvalidFormat(
197 "No sequences found in FASTA file".to_string(),
198 ));
199 }
200
201 Ok(QueryHeader::new(contigs))
202}
203
204#[cfg(test)]
205mod tests {
206 use super::*;
207 use std::io::Write;
208 use tempfile::NamedTempFile;
209
210 #[test]
211 fn test_is_fasta_file() {
212 assert!(is_fasta_file(Path::new("test.fa")));
213 assert!(is_fasta_file(Path::new("test.fasta")));
214 assert!(is_fasta_file(Path::new("test.fna")));
215 assert!(is_fasta_file(Path::new("test.fa.gz")));
216 assert!(is_fasta_file(Path::new("test.fasta.gz")));
217 assert!(is_fasta_file(Path::new("test.fna.bgz")));
218 assert!(is_fasta_file(Path::new("/path/to/Reference.FA")));
219
220 assert!(!is_fasta_file(Path::new("test.bam")));
221 assert!(!is_fasta_file(Path::new("test.sam")));
222 assert!(!is_fasta_file(Path::new("test.fai")));
223 }
224
225 #[test]
226 fn test_parse_fasta_file() {
227 let fasta_content = b">chr1 description\nACGTACGT\nACGT\n>chr2\nGGGG\n";
228
229 let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
230 temp.write_all(fasta_content).unwrap();
231 temp.flush().unwrap();
232
233 let query = parse_fasta_file(temp.path()).unwrap();
234 assert_eq!(query.contigs.len(), 2);
235 assert_eq!(query.contigs[0].name, "chr1");
236 assert_eq!(query.contigs[0].length, 12); assert_eq!(query.contigs[1].name, "chr2");
238 assert_eq!(query.contigs[1].length, 4);
239 }
240
241 #[test]
242 fn test_parse_empty_fasta() {
243 let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
244 temp.write_all(b"").unwrap();
245 temp.flush().unwrap();
246
247 let result = parse_fasta_file(temp.path());
248 assert!(result.is_err());
249 }
250
251 #[test]
252 fn test_parse_fasta_with_md5() {
253 let fasta_content = b">chr1\nACGT\n";
256
257 let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
258 temp.write_all(fasta_content).unwrap();
259 temp.flush().unwrap();
260
261 let query = parse_fasta_file_with_md5(temp.path()).unwrap();
262 assert_eq!(query.contigs.len(), 1);
263 assert_eq!(query.contigs[0].name, "chr1");
264 assert_eq!(query.contigs[0].length, 4);
265 assert_eq!(
266 query.contigs[0].md5,
267 Some("f1f8f4bf413b16ad135722aa4591043e".to_string())
268 );
269 }
270
271 #[test]
272 fn test_parse_fasta_with_md5_lowercase() {
273 let fasta_content = b">chr1\nacgt\n";
275
276 let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
277 temp.write_all(fasta_content).unwrap();
278 temp.flush().unwrap();
279
280 let query = parse_fasta_file_with_md5(temp.path()).unwrap();
281 assert_eq!(
282 query.contigs[0].md5,
283 Some("f1f8f4bf413b16ad135722aa4591043e".to_string())
284 );
285 }
286}