1use crate::core::header::QueryHeader;
2use std::path::Path;
3
/// File formats this module can detect and dispatch to a parser.
///
/// The enum is a plain tag: it is `Copy`, and it derives `Eq` and `Hash` so it
/// can be used as a `HashMap`/`HashSet` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileFormat {
    /// SAM header text (displayed as "SAM/BAM Header").
    Sam,
    /// BAM file; handled by the binary parsing entry points.
    Bam,
    /// CRAM file; handled by the binary parsing entry points.
    Cram,
    /// Sequence dictionary (`@HD` plus `@SQ` records).
    Dict,
    /// VCF file.
    Vcf,
    /// NCBI assembly report.
    NcbiReport,
    /// Tab- or comma-separated table.
    Tsv,
    /// FASTA index (`.fai`).
    Fai,
    /// FASTA file; handled by the binary parsing entry points.
    Fasta,
    /// Let the parser detect the format from the content.
    Auto,
}
28
/// Reasons format detection can fail.
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum FormatError {
    /// The content was empty (after trimming) or matched none of the
    /// detection heuristics.
    #[error("Unable to detect file format from content and filename")]
    UnknownFormat,
    /// The content contains a control character other than newline,
    /// carriage return, or tab, so it is treated as binary.
    #[error("File appears to be binary but cannot determine specific format")]
    UnsupportedBinary,
}
37
/// Errors returned by the parsing entry points of this module.
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
    /// A parser rejected the input, or the requested format is not handled
    /// by the entry point that was called.
    #[error("Failed to parse {format:?} content: {message}")]
    ParseFailed {
        /// Format that was being parsed when the failure occurred.
        format: FileFormat,
        /// Human-readable description of the failure.
        message: String,
    },
    /// An I/O operation failed; converts from `std::io::Error` via `#[from]`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}
46
47impl FileFormat {
48 #[must_use]
50 #[allow(clippy::trivially_copy_pass_by_ref)] pub fn display_name(&self) -> &'static str {
52 match self {
53 FileFormat::Sam => "SAM/BAM Header",
54 FileFormat::Bam => "BAM File",
55 FileFormat::Cram => "CRAM File",
56 FileFormat::Dict => "Sequence Dictionary",
57 FileFormat::Vcf => "VCF File",
58 FileFormat::NcbiReport => "NCBI Assembly Report",
59 FileFormat::Tsv => "TSV/CSV Table",
60 FileFormat::Fai => "FASTA Index",
61 FileFormat::Fasta => "FASTA File",
62 FileFormat::Auto => "Auto-detect",
63 }
64 }
65}
66
67pub fn detect_format(content: &str, filename: Option<&str>) -> Result<FileFormat, FormatError> {
75 if let Some(name) = filename {
77 if let Some(format) = detect_format_from_filename(name) {
78 if matches!(format, FileFormat::Bam | FileFormat::Cram) {
80 return Ok(format);
81 }
82 if validate_format_content(content, &format) {
84 return Ok(format);
85 }
86 }
87 }
88
89 detect_format_from_content(content)
91}
92
93fn detect_format_from_filename(filename: &str) -> Option<FileFormat> {
95 let path = Path::new(filename);
96 let lower_name = filename.to_lowercase();
97
98 if lower_name.ends_with(".vcf.gz") {
100 return Some(FileFormat::Vcf);
101 }
102 if lower_name.ends_with(".fa.gz")
103 || lower_name.ends_with(".fasta.gz")
104 || lower_name.ends_with(".fna.gz")
105 || lower_name.ends_with(".fa.bgz")
106 || lower_name.ends_with(".fasta.bgz")
107 || lower_name.ends_with(".fna.bgz")
108 {
109 return Some(FileFormat::Fasta);
110 }
111
112 let extension = path.extension()?.to_str()?.to_lowercase();
113
114 match extension.as_str() {
115 "sam" => Some(FileFormat::Sam),
116 "bam" => Some(FileFormat::Bam),
117 "cram" => Some(FileFormat::Cram),
118 "dict" => Some(FileFormat::Dict),
119 "vcf" => Some(FileFormat::Vcf),
120 "fai" => Some(FileFormat::Fai),
121 "fa" | "fasta" | "fna" => Some(FileFormat::Fasta),
122 "tsv" | "csv" => Some(FileFormat::Tsv),
123 "txt" => {
124 if lower_name.contains("assembly") || lower_name.contains("report") {
126 Some(FileFormat::NcbiReport)
127 } else if lower_name.ends_with(".dict.txt") {
128 Some(FileFormat::Dict)
129 } else {
130 Some(FileFormat::Sam)
132 }
133 }
134 _ => None,
135 }
136}
137
/// Reports whether `line` starts with `prefix` immediately followed by a tab
/// or a space.
///
/// A line that is exactly `prefix`, or that continues with any other byte,
/// does not count.
fn is_sam_record(line: &str, prefix: &str) -> bool {
    match line.strip_prefix(prefix) {
        Some(rest) => matches!(rest.as_bytes().first(), Some(b'\t' | b' ')),
        None => false,
    }
}
148
149fn detect_format_from_content(content: &str) -> Result<FileFormat, FormatError> {
151 let content_trimmed = content.trim();
152
153 if content_trimmed.is_empty() {
155 return Err(FormatError::UnknownFormat);
156 }
157
158 if content
160 .chars()
161 .any(|c| c.is_control() && c != '\n' && c != '\r' && c != '\t')
162 {
163 return Err(FormatError::UnsupportedBinary);
164 }
165
166 let lines: Vec<&str> = content_trimmed.lines().take(20).collect(); if lines.iter().any(|line| is_sam_record(line, "@HD"))
170 && lines.iter().any(|line| is_sam_record(line, "@SQ"))
171 {
172 return Ok(FileFormat::Dict);
173 }
174
175 if lines.iter().any(|line| is_sam_record(line, "@SQ")) {
177 return Ok(FileFormat::Sam);
178 }
179
180 if lines
182 .iter()
183 .any(|line| line.starts_with("##fileformat=VCF"))
184 || (lines.iter().any(|line| line.starts_with("##"))
185 && lines.iter().any(|line| line.starts_with("##contig=")))
186 {
187 return Ok(FileFormat::Vcf);
188 }
189
190 if lines.iter().any(|line| {
192 line.contains("Sequence-Name")
193 && line.contains("Sequence-Role")
194 && line.contains("Assigned-Molecule")
195 }) {
196 return Ok(FileFormat::NcbiReport);
197 }
198
199 if lines.len() > 1 {
201 let first_line_cols = lines[0].split('\t').count();
202 if first_line_cols > 2
203 && lines
204 .iter()
205 .take(5)
206 .all(|line| line.split('\t').count() == first_line_cols)
207 {
208 if lines[0].to_lowercase().contains("length")
210 || lines[0].to_lowercase().contains("size")
211 || lines[0].to_lowercase().contains("sequence")
212 {
213 return Ok(FileFormat::Tsv);
214 }
215 }
216 }
217
218 if lines.len() > 1 {
220 let first_line_cols = lines[0].split(',').count();
221 if first_line_cols > 2
222 && lines
223 .iter()
224 .take(5)
225 .all(|line| line.split(',').count() == first_line_cols)
226 && (lines[0].to_lowercase().contains("length")
227 || lines[0].to_lowercase().contains("size")
228 || lines[0].to_lowercase().contains("sequence"))
229 {
230 return Ok(FileFormat::Tsv);
231 }
232 }
233
234 if !lines.is_empty() {
237 let fai_lines: Vec<&&str> = lines
238 .iter()
239 .filter(|line| !line.is_empty() && !line.starts_with('#'))
240 .collect();
241
242 if !fai_lines.is_empty()
243 && fai_lines.iter().all(|line| {
244 let fields: Vec<&str> = line.split('\t').collect();
245 if fields.len() != 5 {
246 return false;
247 }
248 fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
250 })
251 {
252 return Ok(FileFormat::Fai);
253 }
254 }
255
256 if lines.iter().any(|line| {
258 line.contains("chr")
259 || line.contains("scaffold")
260 || line.contains("contig")
261 || line.to_lowercase().contains("sequence")
262 || line.to_lowercase().contains("length")
263 }) {
264 return Ok(FileFormat::Sam);
265 }
266
267 Err(FormatError::UnknownFormat)
268}
269
270#[allow(clippy::trivially_copy_pass_by_ref)] fn validate_format_content(content: &str, format: &FileFormat) -> bool {
273 match format {
274 FileFormat::Sam => {
275 content.contains("@SQ") || content.contains("SN:") || content.contains("LN:")
276 }
277 FileFormat::Dict => content.contains("@HD") && content.contains("@SQ"),
278 FileFormat::Vcf => {
279 content.contains("##")
280 && (content.contains("##contig=") || content.contains("##fileformat=VCF"))
281 }
282 FileFormat::NcbiReport => {
283 content.contains("Sequence-Name") || content.contains("Sequence-Role")
284 }
285 FileFormat::Tsv => {
286 content.contains('\t')
287 && (content.to_lowercase().contains("length")
288 || content.to_lowercase().contains("sequence"))
289 }
290 FileFormat::Fai => {
291 let lines: Vec<&str> = content.lines().take(5).collect();
293 lines.iter().any(|line| {
294 let fields: Vec<&str> = line.split('\t').collect();
295 fields.len() == 5 && fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
296 })
297 }
298 FileFormat::Bam | FileFormat::Cram | FileFormat::Fasta => {
299 false
301 }
302 FileFormat::Auto => true, }
304}
305
306pub fn parse_with_format(content: &str, format: FileFormat) -> Result<QueryHeader, ParseError> {
313 match format {
314 FileFormat::Sam => {
315 crate::parsing::sam::parse_header_text(content).map_err(|e| ParseError::ParseFailed {
316 format: FileFormat::Sam,
317 message: e.to_string(),
318 })
319 }
320 FileFormat::Dict => {
321 crate::parsing::dict::parse_dict_text(content).map_err(|e| ParseError::ParseFailed {
322 format: FileFormat::Dict,
323 message: e.to_string(),
324 })
325 }
326 FileFormat::Vcf => crate::parsing::vcf::parse_vcf_header_text(content).map_err(|e| {
327 ParseError::ParseFailed {
328 format: FileFormat::Vcf,
329 message: e.to_string(),
330 }
331 }),
332 FileFormat::NcbiReport => {
333 match crate::parsing::ncbi_report::parse_ncbi_report_text(content) {
335 Ok(entries) => {
336 let contigs = entries.into_iter().map(|entry| entry.to_contig()).collect();
337 Ok(crate::core::header::QueryHeader::new(contigs))
338 }
339 Err(e) => Err(ParseError::ParseFailed {
340 format: FileFormat::NcbiReport,
341 message: e.to_string(),
342 }),
343 }
344 }
345 FileFormat::Tsv => {
346 match crate::parsing::tsv::parse_tsv_text(content, '\t') {
348 Ok(query) => Ok(query),
349 Err(_) => crate::parsing::tsv::parse_tsv_text(content, ',').map_err(|e| {
350 ParseError::ParseFailed {
351 format: FileFormat::Tsv,
352 message: format!("Failed to parse as TSV or CSV: {e}"),
353 }
354 }),
355 }
356 }
357 FileFormat::Fai => {
358 crate::parsing::fai::parse_fai_text(content).map_err(|e| ParseError::ParseFailed {
359 format: FileFormat::Fai,
360 message: e.to_string(),
361 })
362 }
363 FileFormat::Bam => Err(ParseError::ParseFailed {
364 format: FileFormat::Bam,
365 message: "BAM files must be parsed as binary, not text".to_string(),
366 }),
367 FileFormat::Cram => Err(ParseError::ParseFailed {
368 format: FileFormat::Cram,
369 message: "CRAM files must be parsed as binary, not text".to_string(),
370 }),
371 FileFormat::Fasta => Err(ParseError::ParseFailed {
372 format: FileFormat::Fasta,
373 message: "FASTA files must be parsed as binary, not text".to_string(),
374 }),
375 FileFormat::Auto => {
376 let detected_format =
378 detect_format_from_content(content).map_err(|e| ParseError::ParseFailed {
379 format: FileFormat::Auto,
380 message: format!("Auto-detection failed: {e}"),
381 })?;
382
383 parse_with_format(content, detected_format)
384 }
385 }
386}
387
388pub fn parse_binary_file(
399 file_content: &[u8],
400 format: FileFormat,
401) -> Result<QueryHeader, ParseError> {
402 match format {
403 FileFormat::Bam => {
404 let cursor = std::io::Cursor::new(file_content);
405 crate::parsing::sam::parse_bam_from_reader(cursor).map_err(|e| {
406 ParseError::ParseFailed {
407 format,
408 message: format!("BAM file parsing failed: {e}"),
409 }
410 })
411 }
412 FileFormat::Cram => {
413 let cursor = std::io::Cursor::new(file_content);
414 crate::parsing::sam::parse_cram_from_reader(cursor).map_err(|e| {
415 ParseError::ParseFailed {
416 format,
417 message: format!("CRAM file parsing failed: {e}"),
418 }
419 })
420 }
421 FileFormat::Fasta => {
422 use std::io::Write;
424 use tempfile::NamedTempFile;
425
426 let is_gzipped =
427 file_content.len() >= 2 && file_content[0] == 0x1f && file_content[1] == 0x8b;
428 let file_extension = if is_gzipped { ".fa.gz" } else { ".fa" };
429
430 let mut temp_file =
431 NamedTempFile::with_suffix(file_extension).map_err(ParseError::Io)?;
432 temp_file.write_all(file_content).map_err(ParseError::Io)?;
433
434 let result = crate::parsing::fasta::parse_fasta_file(temp_file.path());
435 result.map_err(|e| ParseError::ParseFailed {
436 format,
437 message: format!("FASTA file parsing failed: {e}"),
438 })
439 }
440 _ => Err(ParseError::ParseFailed {
441 format,
442 message: "Format is not a binary file format".to_string(),
443 }),
444 }
445}
446
447pub fn parse_binary_file_from_path(
458 path: &std::path::Path,
459 format: FileFormat,
460) -> Result<QueryHeader, ParseError> {
461 match format {
462 FileFormat::Bam | FileFormat::Cram => {
463 crate::parsing::sam::parse_file(path).map_err(|e| ParseError::ParseFailed {
464 format,
465 message: format!("Binary file parsing failed: {e}"),
466 })
467 }
468 FileFormat::Fasta => {
469 crate::parsing::fasta::parse_fasta_file(path).map_err(|e| ParseError::ParseFailed {
470 format,
471 message: format!("FASTA file parsing failed: {e}"),
472 })
473 }
474 _ => Err(ParseError::ParseFailed {
475 format,
476 message: "Format is not a binary file format".to_string(),
477 }),
478 }
479}
480
/// Unit tests for format detection, content validation and the binary
/// parsing entry points.
#[cfg(test)]
mod tests {
    use super::*;

    /// Each recognized extension (including compound `.gz` suffixes) maps to
    /// its format, and an unrecognized extension yields `None`.
    #[test]
    fn test_filename_detection() {
        assert_eq!(
            detect_format_from_filename("test.sam"),
            Some(FileFormat::Sam)
        );
        assert_eq!(
            detect_format_from_filename("test.bam"),
            Some(FileFormat::Bam)
        );
        assert_eq!(
            detect_format_from_filename("test.dict"),
            Some(FileFormat::Dict)
        );
        assert_eq!(
            detect_format_from_filename("test.vcf"),
            Some(FileFormat::Vcf)
        );
        assert_eq!(
            detect_format_from_filename("test.vcf.gz"),
            Some(FileFormat::Vcf)
        );
        assert_eq!(
            detect_format_from_filename("assembly_report.txt"),
            Some(FileFormat::NcbiReport)
        );
        assert_eq!(
            detect_format_from_filename("reference.fai"),
            Some(FileFormat::Fai)
        );
        assert_eq!(
            detect_format_from_filename("reference.fa"),
            Some(FileFormat::Fasta)
        );
        assert_eq!(
            detect_format_from_filename("reference.fasta"),
            Some(FileFormat::Fasta)
        );
        assert_eq!(
            detect_format_from_filename("reference.fa.gz"),
            Some(FileFormat::Fasta)
        );
        assert_eq!(
            detect_format_from_filename("reference.fasta.gz"),
            Some(FileFormat::Fasta)
        );
        assert_eq!(detect_format_from_filename("unknown.xyz"), None);
    }

    /// A lone tab-separated `@SQ` record is detected as SAM.
    #[test]
    fn test_sam_header_detection() {
        let content = "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Sam));
    }

    /// `@HD` together with `@SQ` is detected as a sequence dictionary.
    #[test]
    fn test_dict_detection() {
        let content = "@HD\tVN:1.0\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\tM5:abc123\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Dict));
    }

    /// A `##fileformat=VCF` header is detected as VCF.
    #[test]
    fn test_vcf_detection() {
        let content = "##fileformat=VCFv4.2\n##contig=<ID=chr1,length=248956422>\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Vcf));
    }

    /// A header line naming the NCBI report columns is detected as an NCBI report.
    #[test]
    fn test_ncbi_report_detection() {
        let content =
            "# Sequence-Name\tSequence-Role\tAssigned-Molecule\tAssigned-Molecule-Location/Type\n";
        assert_eq!(
            detect_format_from_content(content),
            Ok(FileFormat::NcbiReport)
        );
    }

    /// Rows of five tab-separated fields with four numeric columns are detected as FAI.
    #[test]
    fn test_fai_detection() {
        let content = "chr1\t248956422\t112\t70\t71\nchr2\t242193529\t253404903\t70\t71\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Fai));
    }

    /// FAI validation accepts a five-field row and rejects a three-field row.
    #[test]
    fn test_fai_validation() {
        assert!(validate_format_content(
            "chr1\t248956422\t112\t70\t71",
            &FileFormat::Fai
        ));
        assert!(!validate_format_content(
            "chr1\t248956422\t112",
            &FileFormat::Fai
        ));
    }

    /// SAM and VCF validation accept matching content and reject mismatching content.
    #[test]
    fn test_format_validation() {
        assert!(validate_format_content(
            "@SQ\tSN:chr1\tLN:123",
            &FileFormat::Sam
        ));
        assert!(!validate_format_content("random text", &FileFormat::Sam));

        assert!(validate_format_content(
            "##contig=<ID=chr1>",
            &FileFormat::Vcf
        ));
        assert!(!validate_format_content("@SQ\tSN:chr1", &FileFormat::Vcf));
    }

    /// An `@SQ` record whose fields are separated by spaces is still detected as SAM.
    #[test]
    fn test_sam_header_detection_with_spaces() {
        let content = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Sam));
    }

    /// Space-separated `@HD` and `@SQ` records are still detected as a dictionary.
    #[test]
    fn test_dict_detection_with_spaces() {
        let content = "@HD VN:1.0 SO:coordinate\n@SQ SN:chr1 LN:248956422\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Dict));
    }

    /// SAM validation accepts space-separated records.
    #[test]
    fn test_sam_validation_with_spaces() {
        assert!(validate_format_content(
            "@SQ SN:chr1 LN:123",
            &FileFormat::Sam
        ));
    }

    /// Filename hint and content detection combined.
    #[test]
    fn test_combined_detection() {
        let content = "@SQ\tSN:chr1\tLN:248956422\n";
        // Hint agrees with the content.
        assert_eq!(
            detect_format(content, Some("test.sam")),
            Ok(FileFormat::Sam)
        );
        // The `.dict` hint fails validation (no `@HD`), so content detection decides.
        assert_eq!(
            detect_format(content, Some("test.dict")),
            Ok(FileFormat::Sam)
        );
        // No hint at all: content detection only.
        assert_eq!(detect_format(content, None), Ok(FileFormat::Sam));
    }

    /// A BAM header written to memory round-trips through `parse_binary_file`.
    #[test]
    fn test_parse_binary_file_bam_from_bytes() {
        use noodles::bam;
        use noodles::sam;
        use noodles::sam::header::record::value::map::{Map, ReferenceSequence};
        use std::num::NonZeroUsize;

        let header = sam::Header::builder()
            .add_reference_sequence(
                "chr1",
                Map::<ReferenceSequence>::new(NonZeroUsize::new(248_956_422).unwrap()),
            )
            .build();
        let mut bam_bytes = Vec::new();
        {
            // Scoped so the writer is dropped before `bam_bytes` is read.
            let mut writer = bam::io::Writer::new(&mut bam_bytes);
            writer.write_header(&header).unwrap();
        }

        let query = parse_binary_file(&bam_bytes, FileFormat::Bam).unwrap();
        assert_eq!(query.contigs.len(), 1);
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
    }

    /// Test helper: parses `header_text` as a SAM header and returns it
    /// serialized as BAM bytes (header only, no records).
    fn build_bam_bytes(header_text: &str) -> Vec<u8> {
        use noodles::bam;
        use noodles::sam;

        let mut reader = sam::io::Reader::new(header_text.as_bytes());
        let header = reader.read_header().unwrap();

        let mut buf = Vec::new();
        {
            // Scoped so the writer is dropped before `buf` is returned.
            let mut writer = bam::io::Writer::new(&mut buf);
            writer.write_header(&header).unwrap();
        }
        buf
    }

    /// A BAM file on disk is parsed by path and yields both contigs in order.
    #[test]
    fn test_parse_binary_file_from_path_bam() {
        use std::io::Write;
        use tempfile::NamedTempFile;

        let bam_bytes = build_bam_bytes(
            "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n@SQ\tSN:chr2\tLN:242193529\n",
        );

        let mut temp = NamedTempFile::with_suffix(".bam").unwrap();
        temp.write_all(&bam_bytes).unwrap();

        let result = parse_binary_file_from_path(temp.path(), FileFormat::Bam);
        assert!(result.is_ok());
        let query = result.unwrap();
        assert_eq!(query.contigs.len(), 2);
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
        assert_eq!(query.contigs[1].name, "chr2");
        assert_eq!(query.contigs[1].length, 242_193_529);
    }

    /// The header is still expected to parse when 1024 zero bytes follow the
    /// BAM produced by `build_bam_bytes`.
    // NOTE(review): the name says "truncated", but the data is extended with
    // zero bytes rather than cut short — presumably this simulates an invalid
    // or incomplete record section after the header; confirm the intent.
    #[test]
    fn test_parse_binary_file_from_path_truncated_bam() {
        use std::io::Write;
        use tempfile::NamedTempFile;

        let mut bam_bytes = build_bam_bytes("@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n");

        bam_bytes.extend_from_slice(&[0u8; 1024]);

        let mut temp = NamedTempFile::with_suffix(".bam").unwrap();
        temp.write_all(&bam_bytes).unwrap();

        let result = parse_binary_file_from_path(temp.path(), FileFormat::Bam);
        assert!(result.is_ok());
        let query = result.unwrap();
        assert_eq!(query.contigs.len(), 1);
        assert_eq!(query.contigs[0].name, "chr1");
    }

    /// A text format passed to the binary-by-path entry point is rejected.
    #[test]
    fn test_parse_binary_file_from_path_unsupported_format() {
        use std::io::Write;
        use tempfile::NamedTempFile;

        let mut temp = NamedTempFile::with_suffix(".txt").unwrap();
        temp.write_all(b"not a binary file").unwrap();

        let result = parse_binary_file_from_path(temp.path(), FileFormat::Sam);
        assert!(result.is_err());
    }
}