1use crate::core::header::QueryHeader;
2use std::path::Path;
3
/// File formats this module can detect or be asked to parse.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so that a format can
/// be used as a key in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileFormat {
    /// SAM header text; displayed as "SAM/BAM Header".
    Sam,
    /// BAM file; rejected by `parse_with_format`, handled by `parse_binary_file`.
    Bam,
    /// CRAM file; rejected by `parse_with_format`, handled by `parse_binary_file`.
    Cram,
    /// Sequence dictionary (`@HD` plus `@SQ` lines).
    Dict,
    /// VCF file (header lines starting with `##`).
    Vcf,
    /// NCBI assembly report table.
    NcbiReport,
    /// Tab- or comma-separated table.
    Tsv,
    /// FASTA index (five tab-separated columns per record).
    Fai,
    /// FASTA file; rejected by `parse_with_format`, handled by `parse_binary_file`.
    Fasta,
    /// Not a concrete format: asks `parse_with_format` to detect the format from content.
    Auto,
}
28
/// Errors returned when the format of an input cannot be determined.
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum FormatError {
    /// No heuristic matched; also returned when the content is empty after trimming.
    #[error("Unable to detect file format from content and filename")]
    UnknownFormat,
    /// The content contains control characters other than newline, carriage
    /// return, or tab, so it is treated as binary.
    #[error("File appears to be binary but cannot determine specific format")]
    UnsupportedBinary,
}
37
/// Errors returned by the parsing entry points in this module.
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
    /// Parsing failed for `format`; `message` carries the human-readable reason.
    #[error("Failed to parse {format:?} content: {message}")]
    ParseFailed { format: FileFormat, message: String },
    /// An I/O operation failed; `#[from]` provides the conversion from `std::io::Error`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}
46
47impl FileFormat {
48 #[must_use]
50 #[allow(clippy::trivially_copy_pass_by_ref)] pub fn display_name(&self) -> &'static str {
52 match self {
53 FileFormat::Sam => "SAM/BAM Header",
54 FileFormat::Bam => "BAM File",
55 FileFormat::Cram => "CRAM File",
56 FileFormat::Dict => "Sequence Dictionary",
57 FileFormat::Vcf => "VCF File",
58 FileFormat::NcbiReport => "NCBI Assembly Report",
59 FileFormat::Tsv => "TSV/CSV Table",
60 FileFormat::Fai => "FASTA Index",
61 FileFormat::Fasta => "FASTA File",
62 FileFormat::Auto => "Auto-detect",
63 }
64 }
65}
66
67pub fn detect_format(content: &str, filename: Option<&str>) -> Result<FileFormat, FormatError> {
75 if let Some(name) = filename {
77 if let Some(format) = detect_format_from_filename(name) {
78 if matches!(format, FileFormat::Bam | FileFormat::Cram) {
80 return Ok(format);
81 }
82 if validate_format_content(content, &format) {
84 return Ok(format);
85 }
86 }
87 }
88
89 detect_format_from_content(content)
91}
92
93fn detect_format_from_filename(filename: &str) -> Option<FileFormat> {
95 let path = Path::new(filename);
96 let lower_name = filename.to_lowercase();
97
98 if lower_name.ends_with(".vcf.gz") {
100 return Some(FileFormat::Vcf);
101 }
102 if lower_name.ends_with(".fa.gz")
103 || lower_name.ends_with(".fasta.gz")
104 || lower_name.ends_with(".fna.gz")
105 || lower_name.ends_with(".fa.bgz")
106 || lower_name.ends_with(".fasta.bgz")
107 || lower_name.ends_with(".fna.bgz")
108 {
109 return Some(FileFormat::Fasta);
110 }
111
112 let extension = path.extension()?.to_str()?.to_lowercase();
113
114 match extension.as_str() {
115 "sam" => Some(FileFormat::Sam),
116 "bam" => Some(FileFormat::Bam),
117 "cram" => Some(FileFormat::Cram),
118 "dict" => Some(FileFormat::Dict),
119 "vcf" => Some(FileFormat::Vcf),
120 "fai" => Some(FileFormat::Fai),
121 "fa" | "fasta" | "fna" => Some(FileFormat::Fasta),
122 "tsv" | "csv" => Some(FileFormat::Tsv),
123 "txt" => {
124 if lower_name.contains("assembly") || lower_name.contains("report") {
126 Some(FileFormat::NcbiReport)
127 } else if lower_name.ends_with(".dict.txt") {
128 Some(FileFormat::Dict)
129 } else {
130 Some(FileFormat::Sam)
132 }
133 }
134 _ => None,
135 }
136}
137
138fn detect_format_from_content(content: &str) -> Result<FileFormat, FormatError> {
140 let content_trimmed = content.trim();
141
142 if content_trimmed.is_empty() {
144 return Err(FormatError::UnknownFormat);
145 }
146
147 if content
149 .chars()
150 .any(|c| c.is_control() && c != '\n' && c != '\r' && c != '\t')
151 {
152 return Err(FormatError::UnsupportedBinary);
153 }
154
155 let lines: Vec<&str> = content_trimmed.lines().take(20).collect(); if lines.iter().any(|line| line.starts_with("@HD\t"))
159 && lines.iter().any(|line| line.starts_with("@SQ\t"))
160 {
161 return Ok(FileFormat::Dict);
162 }
163
164 if lines.iter().any(|line| line.starts_with("@SQ\t")) {
166 return Ok(FileFormat::Sam);
167 }
168
169 if lines
171 .iter()
172 .any(|line| line.starts_with("##fileformat=VCF"))
173 || (lines.iter().any(|line| line.starts_with("##"))
174 && lines.iter().any(|line| line.starts_with("##contig=")))
175 {
176 return Ok(FileFormat::Vcf);
177 }
178
179 if lines.iter().any(|line| {
181 line.contains("Sequence-Name")
182 && line.contains("Sequence-Role")
183 && line.contains("Assigned-Molecule")
184 }) {
185 return Ok(FileFormat::NcbiReport);
186 }
187
188 if lines.len() > 1 {
190 let first_line_cols = lines[0].split('\t').count();
191 if first_line_cols > 2
192 && lines
193 .iter()
194 .take(5)
195 .all(|line| line.split('\t').count() == first_line_cols)
196 {
197 if lines[0].to_lowercase().contains("length")
199 || lines[0].to_lowercase().contains("size")
200 || lines[0].to_lowercase().contains("sequence")
201 {
202 return Ok(FileFormat::Tsv);
203 }
204 }
205 }
206
207 if lines.len() > 1 {
209 let first_line_cols = lines[0].split(',').count();
210 if first_line_cols > 2
211 && lines
212 .iter()
213 .take(5)
214 .all(|line| line.split(',').count() == first_line_cols)
215 && (lines[0].to_lowercase().contains("length")
216 || lines[0].to_lowercase().contains("size")
217 || lines[0].to_lowercase().contains("sequence"))
218 {
219 return Ok(FileFormat::Tsv);
220 }
221 }
222
223 if !lines.is_empty() {
226 let fai_lines: Vec<&&str> = lines
227 .iter()
228 .filter(|line| !line.is_empty() && !line.starts_with('#'))
229 .collect();
230
231 if !fai_lines.is_empty()
232 && fai_lines.iter().all(|line| {
233 let fields: Vec<&str> = line.split('\t').collect();
234 if fields.len() != 5 {
235 return false;
236 }
237 fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
239 })
240 {
241 return Ok(FileFormat::Fai);
242 }
243 }
244
245 if lines.iter().any(|line| {
247 line.contains("chr")
248 || line.contains("scaffold")
249 || line.contains("contig")
250 || line.to_lowercase().contains("sequence")
251 || line.to_lowercase().contains("length")
252 }) {
253 return Ok(FileFormat::Sam);
254 }
255
256 Err(FormatError::UnknownFormat)
257}
258
259#[allow(clippy::trivially_copy_pass_by_ref)] fn validate_format_content(content: &str, format: &FileFormat) -> bool {
262 match format {
263 FileFormat::Sam => {
264 content.contains("@SQ") || content.contains("SN:") || content.contains("LN:")
265 }
266 FileFormat::Dict => content.contains("@HD") && content.contains("@SQ"),
267 FileFormat::Vcf => {
268 content.contains("##")
269 && (content.contains("##contig=") || content.contains("##fileformat=VCF"))
270 }
271 FileFormat::NcbiReport => {
272 content.contains("Sequence-Name") || content.contains("Sequence-Role")
273 }
274 FileFormat::Tsv => {
275 content.contains('\t')
276 && (content.to_lowercase().contains("length")
277 || content.to_lowercase().contains("sequence"))
278 }
279 FileFormat::Fai => {
280 let lines: Vec<&str> = content.lines().take(5).collect();
282 lines.iter().any(|line| {
283 let fields: Vec<&str> = line.split('\t').collect();
284 fields.len() == 5 && fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
285 })
286 }
287 FileFormat::Bam | FileFormat::Cram | FileFormat::Fasta => {
288 false
290 }
291 FileFormat::Auto => true, }
293}
294
295pub fn parse_with_format(content: &str, format: FileFormat) -> Result<QueryHeader, ParseError> {
302 match format {
303 FileFormat::Sam => {
304 crate::parsing::sam::parse_header_text(content).map_err(|e| ParseError::ParseFailed {
305 format: FileFormat::Sam,
306 message: e.to_string(),
307 })
308 }
309 FileFormat::Dict => {
310 crate::parsing::dict::parse_dict_text(content).map_err(|e| ParseError::ParseFailed {
311 format: FileFormat::Dict,
312 message: e.to_string(),
313 })
314 }
315 FileFormat::Vcf => crate::parsing::vcf::parse_vcf_header_text(content).map_err(|e| {
316 ParseError::ParseFailed {
317 format: FileFormat::Vcf,
318 message: e.to_string(),
319 }
320 }),
321 FileFormat::NcbiReport => {
322 match crate::parsing::ncbi_report::parse_ncbi_report_text(content) {
324 Ok(entries) => {
325 let contigs = entries.into_iter().map(|entry| entry.to_contig()).collect();
326 Ok(crate::core::header::QueryHeader::new(contigs))
327 }
328 Err(e) => Err(ParseError::ParseFailed {
329 format: FileFormat::NcbiReport,
330 message: e.to_string(),
331 }),
332 }
333 }
334 FileFormat::Tsv => {
335 match crate::parsing::tsv::parse_tsv_text(content, '\t') {
337 Ok(query) => Ok(query),
338 Err(_) => crate::parsing::tsv::parse_tsv_text(content, ',').map_err(|e| {
339 ParseError::ParseFailed {
340 format: FileFormat::Tsv,
341 message: format!("Failed to parse as TSV or CSV: {e}"),
342 }
343 }),
344 }
345 }
346 FileFormat::Fai => {
347 crate::parsing::fai::parse_fai_text(content).map_err(|e| ParseError::ParseFailed {
348 format: FileFormat::Fai,
349 message: e.to_string(),
350 })
351 }
352 FileFormat::Bam => Err(ParseError::ParseFailed {
353 format: FileFormat::Bam,
354 message: "BAM files must be parsed as binary, not text".to_string(),
355 }),
356 FileFormat::Cram => Err(ParseError::ParseFailed {
357 format: FileFormat::Cram,
358 message: "CRAM files must be parsed as binary, not text".to_string(),
359 }),
360 FileFormat::Fasta => Err(ParseError::ParseFailed {
361 format: FileFormat::Fasta,
362 message: "FASTA files must be parsed as binary, not text".to_string(),
363 }),
364 FileFormat::Auto => {
365 let detected_format =
367 detect_format_from_content(content).map_err(|e| ParseError::ParseFailed {
368 format: FileFormat::Auto,
369 message: format!("Auto-detection failed: {e}"),
370 })?;
371
372 parse_with_format(content, detected_format)
373 }
374 }
375}
376
377pub fn parse_binary_file(
386 file_content: &[u8],
387 format: FileFormat,
388) -> Result<QueryHeader, ParseError> {
389 use std::io::Write;
390 use tempfile::NamedTempFile;
391
392 match format {
393 FileFormat::Bam | FileFormat::Cram => {
394 let file_extension = match format {
396 FileFormat::Bam => ".bam",
397 FileFormat::Cram => ".cram",
398 _ => ".bin",
399 };
400
401 let mut temp_file =
402 NamedTempFile::with_suffix(file_extension).map_err(ParseError::Io)?;
403
404 temp_file.write_all(file_content).map_err(ParseError::Io)?;
406
407 let result = crate::parsing::sam::parse_file(temp_file.path());
409
410 result.map_err(|_e| ParseError::ParseFailed {
412 format,
413 message: "Binary file parsing failed".to_string(), })
415 }
416 FileFormat::Fasta => {
417 let is_gzipped =
419 file_content.len() >= 2 && file_content[0] == 0x1f && file_content[1] == 0x8b;
420 let file_extension = if is_gzipped { ".fa.gz" } else { ".fa" };
421
422 let mut temp_file =
423 NamedTempFile::with_suffix(file_extension).map_err(ParseError::Io)?;
424
425 temp_file.write_all(file_content).map_err(ParseError::Io)?;
427
428 let result = crate::parsing::fasta::parse_fasta_file(temp_file.path());
430
431 result.map_err(|_e| ParseError::ParseFailed {
433 format,
434 message: "Binary file parsing failed".to_string(), })
436 }
437 _ => Err(ParseError::ParseFailed {
438 format,
439 message: "Format is not a binary file format".to_string(),
440 }),
441 }
442}
443
#[cfg(test)]
mod tests {
    use super::*;

    /// Filename-based detection across the supported extensions.
    #[test]
    fn test_filename_detection() {
        let cases = [
            ("test.sam", Some(FileFormat::Sam)),
            ("test.bam", Some(FileFormat::Bam)),
            ("test.dict", Some(FileFormat::Dict)),
            ("test.vcf", Some(FileFormat::Vcf)),
            ("test.vcf.gz", Some(FileFormat::Vcf)),
            ("assembly_report.txt", Some(FileFormat::NcbiReport)),
            ("reference.fai", Some(FileFormat::Fai)),
            ("reference.fa", Some(FileFormat::Fasta)),
            ("reference.fasta", Some(FileFormat::Fasta)),
            ("reference.fa.gz", Some(FileFormat::Fasta)),
            ("reference.fasta.gz", Some(FileFormat::Fasta)),
            ("unknown.xyz", None),
        ];

        for &(filename, expected) in &cases {
            assert_eq!(
                detect_format_from_filename(filename),
                expected,
                "filename: {}",
                filename
            );
        }
    }

    /// A lone `@SQ` line is detected as SAM.
    #[test]
    fn test_sam_header_detection() {
        let detected = detect_format_from_content(
            "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n",
        );
        assert_eq!(detected, Ok(FileFormat::Sam));
    }

    /// `@HD` together with `@SQ` is detected as a sequence dictionary.
    #[test]
    fn test_dict_detection() {
        let detected = detect_format_from_content(
            "@HD\tVN:1.0\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\tM5:abc123\n",
        );
        assert_eq!(detected, Ok(FileFormat::Dict));
    }

    /// VCF meta lines are detected as VCF.
    #[test]
    fn test_vcf_detection() {
        let detected = detect_format_from_content(
            "##fileformat=VCFv4.2\n##contig=<ID=chr1,length=248956422>\n",
        );
        assert_eq!(detected, Ok(FileFormat::Vcf));
    }

    /// The NCBI assembly-report column header is detected as an NCBI report.
    #[test]
    fn test_ncbi_report_detection() {
        let header =
            "# Sequence-Name\tSequence-Role\tAssigned-Molecule\tAssigned-Molecule-Location/Type\n";
        let detected = detect_format_from_content(header);
        assert_eq!(detected, Ok(FileFormat::NcbiReport));
    }

    /// Five-column numeric records are detected as a FASTA index.
    #[test]
    fn test_fai_detection() {
        let detected = detect_format_from_content(
            "chr1\t248956422\t112\t70\t71\nchr2\t242193529\t253404903\t70\t71\n",
        );
        assert_eq!(detected, Ok(FileFormat::Fai));
    }

    /// FAI validation requires exactly five columns.
    #[test]
    fn test_fai_validation() {
        let complete_record = "chr1\t248956422\t112\t70\t71";
        let truncated_record = "chr1\t248956422\t112";

        assert!(validate_format_content(complete_record, &FileFormat::Fai));
        assert!(!validate_format_content(truncated_record, &FileFormat::Fai));
    }

    /// SAM and VCF validation accept matching content and reject other content.
    #[test]
    fn test_format_validation() {
        let sam = FileFormat::Sam;
        assert!(validate_format_content("@SQ\tSN:chr1\tLN:123", &sam));
        assert!(!validate_format_content("random text", &sam));

        let vcf = FileFormat::Vcf;
        assert!(validate_format_content("##contig=<ID=chr1>", &vcf));
        assert!(!validate_format_content("@SQ\tSN:chr1", &vcf));
    }

    /// A filename hint that the content does not confirm falls back to content detection.
    #[test]
    fn test_combined_detection() {
        let content = "@SQ\tSN:chr1\tLN:248956422\n";

        for &hint in &[Some("test.sam"), Some("test.dict"), None] {
            assert_eq!(
                detect_format(content, hint),
                Ok(FileFormat::Sam),
                "hint: {:?}",
                hint
            );
        }
    }
}