ref_solver/parsing/
fasta.rs1use std::ffi::OsStr;
12use std::io::{BufRead, BufReader};
13use std::path::Path;
14
15use flate2::read::GzDecoder;
16use noodles::fasta;
17
18use crate::core::contig::Contig;
19use crate::core::header::QueryHeader;
20use crate::parsing::sam::ParseError;
21use crate::utils::validation::check_contig_limit;
22
23pub fn is_fasta_file(path: &Path) -> bool {
25 let path_str = path.to_string_lossy().to_lowercase();
26
27 if path_str.ends_with(".fa.gz")
29 || path_str.ends_with(".fasta.gz")
30 || path_str.ends_with(".fna.gz")
31 || path_str.ends_with(".fa.bgz")
32 || path_str.ends_with(".fasta.bgz")
33 || path_str.ends_with(".fna.bgz")
34 {
35 return true;
36 }
37
38 matches!(
40 path.extension()
41 .and_then(OsStr::to_str)
42 .map(str::to_lowercase)
43 .as_deref(),
44 Some("fa" | "fasta" | "fna")
45 )
46}
47
48#[allow(clippy::case_sensitive_file_extension_comparisons)] fn is_gzipped(path: &Path) -> bool {
51 let path_str = path.to_string_lossy().to_lowercase();
52 path_str.ends_with(".gz") || path_str.ends_with(".bgz")
53}
54
55pub fn parse_fasta_file(path: &Path) -> Result<QueryHeader, ParseError> {
66 if is_gzipped(path) {
67 parse_fasta_gzipped(path)
68 } else {
69 parse_fasta_uncompressed(path)
70 }
71}
72
73pub fn parse_fasta_file_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
85 if is_gzipped(path) {
86 parse_fasta_gzipped_with_md5(path)
87 } else {
88 parse_fasta_uncompressed_with_md5(path)
89 }
90}
91
92fn parse_fasta_uncompressed_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
94 let file = std::fs::File::open(path)?;
95 let reader = BufReader::new(file);
96 let mut fasta_reader = fasta::io::Reader::new(reader);
97
98 parse_fasta_reader_with_md5(&mut fasta_reader)
99}
100
101fn parse_fasta_gzipped_with_md5(path: &Path) -> Result<QueryHeader, ParseError> {
103 let file = std::fs::File::open(path)?;
104 let decoder = GzDecoder::new(file);
105 let reader = BufReader::new(decoder);
106 let mut fasta_reader = fasta::io::Reader::new(reader);
107
108 parse_fasta_reader_with_md5(&mut fasta_reader)
109}
110
111fn parse_fasta_reader_with_md5<R: BufRead>(
113 reader: &mut fasta::io::Reader<R>,
114) -> Result<QueryHeader, ParseError> {
115 let mut contigs = Vec::new();
116
117 for result in reader.records() {
118 let record = result
119 .map_err(|e| ParseError::Noodles(format!("Failed to parse FASTA record: {e}")))?;
120
121 if check_contig_limit(contigs.len()).is_some() {
123 return Err(ParseError::TooManyContigs(contigs.len()));
124 }
125
126 let name = String::from_utf8_lossy(record.name()).to_string();
127 let sequence = record.sequence();
128 let length = sequence.len() as u64;
129
130 let uppercase: Vec<u8> = sequence
132 .as_ref()
133 .iter()
134 .map(u8::to_ascii_uppercase)
135 .collect();
136 let md5 = format!("{:x}", md5::compute(&uppercase));
137
138 let mut contig = Contig::new(name, length);
139 contig.md5 = Some(md5);
140 contigs.push(contig);
141 }
142
143 if contigs.is_empty() {
144 return Err(ParseError::InvalidFormat(
145 "No sequences found in FASTA file".to_string(),
146 ));
147 }
148
149 Ok(QueryHeader::new(contigs))
150}
151
152fn parse_fasta_uncompressed(path: &Path) -> Result<QueryHeader, ParseError> {
154 let file = std::fs::File::open(path)?;
155 let reader = BufReader::new(file);
156 let mut fasta_reader = fasta::io::Reader::new(reader);
157
158 parse_fasta_reader(&mut fasta_reader)
159}
160
161fn parse_fasta_gzipped(path: &Path) -> Result<QueryHeader, ParseError> {
163 let file = std::fs::File::open(path)?;
164 let decoder = GzDecoder::new(file);
165 let reader = BufReader::new(decoder);
166 let mut fasta_reader = fasta::io::Reader::new(reader);
167
168 parse_fasta_reader(&mut fasta_reader)
169}
170
171fn parse_fasta_reader<R: BufRead>(
173 reader: &mut fasta::io::Reader<R>,
174) -> Result<QueryHeader, ParseError> {
175 let mut contigs = Vec::new();
176
177 for result in reader.records() {
178 let record = result
179 .map_err(|e| ParseError::Noodles(format!("Failed to parse FASTA record: {e}")))?;
180
181 if check_contig_limit(contigs.len()).is_some() {
183 return Err(ParseError::TooManyContigs(contigs.len()));
184 }
185
186 let name = String::from_utf8_lossy(record.name()).to_string();
187 let length = record.sequence().len() as u64;
188
189 contigs.push(Contig::new(name, length));
190 }
191
192 if contigs.is_empty() {
193 return Err(ParseError::InvalidFormat(
194 "No sequences found in FASTA file".to_string(),
195 ));
196 }
197
198 Ok(QueryHeader::new(contigs))
199}
200
201#[cfg(test)]
202mod tests {
203 use super::*;
204 use std::io::Write;
205 use tempfile::NamedTempFile;
206
207 #[test]
208 fn test_is_fasta_file() {
209 assert!(is_fasta_file(Path::new("test.fa")));
210 assert!(is_fasta_file(Path::new("test.fasta")));
211 assert!(is_fasta_file(Path::new("test.fna")));
212 assert!(is_fasta_file(Path::new("test.fa.gz")));
213 assert!(is_fasta_file(Path::new("test.fasta.gz")));
214 assert!(is_fasta_file(Path::new("test.fna.bgz")));
215 assert!(is_fasta_file(Path::new("/path/to/Reference.FA")));
216
217 assert!(!is_fasta_file(Path::new("test.bam")));
218 assert!(!is_fasta_file(Path::new("test.sam")));
219 assert!(!is_fasta_file(Path::new("test.fai")));
220 }
221
222 #[test]
223 fn test_parse_fasta_file() {
224 let fasta_content = b">chr1 description\nACGTACGT\nACGT\n>chr2\nGGGG\n";
225
226 let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
227 temp.write_all(fasta_content).unwrap();
228 temp.flush().unwrap();
229
230 let query = parse_fasta_file(temp.path()).unwrap();
231 assert_eq!(query.contigs.len(), 2);
232 assert_eq!(query.contigs[0].name, "chr1");
233 assert_eq!(query.contigs[0].length, 12); assert_eq!(query.contigs[1].name, "chr2");
235 assert_eq!(query.contigs[1].length, 4);
236 }
237
238 #[test]
239 fn test_parse_empty_fasta() {
240 let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
241 temp.write_all(b"").unwrap();
242 temp.flush().unwrap();
243
244 let result = parse_fasta_file(temp.path());
245 assert!(result.is_err());
246 }
247
248 #[test]
249 fn test_parse_fasta_with_md5() {
250 let fasta_content = b">chr1\nACGT\n";
253
254 let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
255 temp.write_all(fasta_content).unwrap();
256 temp.flush().unwrap();
257
258 let query = parse_fasta_file_with_md5(temp.path()).unwrap();
259 assert_eq!(query.contigs.len(), 1);
260 assert_eq!(query.contigs[0].name, "chr1");
261 assert_eq!(query.contigs[0].length, 4);
262 assert_eq!(
263 query.contigs[0].md5,
264 Some("f1f8f4bf413b16ad135722aa4591043e".to_string())
265 );
266 }
267
268 #[test]
269 fn test_parse_fasta_with_md5_lowercase() {
270 let fasta_content = b">chr1\nacgt\n";
272
273 let mut temp = NamedTempFile::with_suffix(".fa").unwrap();
274 temp.write_all(fasta_content).unwrap();
275 temp.flush().unwrap();
276
277 let query = parse_fasta_file_with_md5(temp.path()).unwrap();
278 assert_eq!(
279 query.contigs[0].md5,
280 Some("f1f8f4bf413b16ad135722aa4591043e".to_string())
281 );
282 }
283}