// ref_solver/web/format_detection.rs

1use crate::core::header::QueryHeader;
2use std::path::Path;
3
/// Supported file formats for reference identification
///
/// `Bam`, `Cram` and `Fasta` are handled by the binary parsing entry points
/// (`parse_binary_file`, `parse_binary_file_from_path`); every other concrete
/// variant is parsed from text by `parse_with_format`. `Auto` is a request to
/// detect the format from the content rather than a format of its own.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileFormat {
    /// SAM header text, including headers pasted as plain text
    Sam,
    /// BAM binary format
    Bam,
    /// CRAM binary format
    Cram,
    /// Picard sequence dictionary files
    Dict,
    /// VCF files with contig headers
    Vcf,
    /// NCBI assembly report files
    NcbiReport,
    /// TSV/CSV tabular files
    Tsv,
    /// FASTA index (.fai) files
    Fai,
    /// FASTA sequence files
    Fasta,
    /// Automatically detect format
    Auto,
}
28
29/// Errors that can occur during format detection
30#[derive(Debug, PartialEq, thiserror::Error)]
31pub enum FormatError {
32    #[error("Unable to detect file format from content and filename")]
33    UnknownFormat,
34    #[error("File appears to be binary but cannot determine specific format")]
35    UnsupportedBinary,
36}
37
38/// Errors that can occur during parsing
39#[derive(Debug, thiserror::Error)]
40pub enum ParseError {
41    #[error("Failed to parse {format:?} content: {message}")]
42    ParseFailed { format: FileFormat, message: String },
43    #[error("IO error: {0}")]
44    Io(#[from] std::io::Error),
45}
46
47impl FileFormat {
48    /// Get the display name for this format
49    #[must_use]
50    #[allow(clippy::trivially_copy_pass_by_ref)] // Idiomatic method signature
51    pub fn display_name(&self) -> &'static str {
52        match self {
53            FileFormat::Sam => "SAM/BAM Header",
54            FileFormat::Bam => "BAM File",
55            FileFormat::Cram => "CRAM File",
56            FileFormat::Dict => "Sequence Dictionary",
57            FileFormat::Vcf => "VCF File",
58            FileFormat::NcbiReport => "NCBI Assembly Report",
59            FileFormat::Tsv => "TSV/CSV Table",
60            FileFormat::Fai => "FASTA Index",
61            FileFormat::Fasta => "FASTA File",
62            FileFormat::Auto => "Auto-detect",
63        }
64    }
65}
66
67/// Detect file format from content and optional filename
68///
69/// # Errors
70///
71/// Returns `FormatError::UnknownFormat` if the format cannot be detected, or
72/// `FormatError::UnsupportedBinary` if the file appears to be binary but the
73/// specific format cannot be determined.
74pub fn detect_format(content: &str, filename: Option<&str>) -> Result<FileFormat, FormatError> {
75    // First try filename-based detection if available
76    if let Some(name) = filename {
77        if let Some(format) = detect_format_from_filename(name) {
78            // For binary formats, trust filename-based detection without content validation
79            if matches!(format, FileFormat::Bam | FileFormat::Cram) {
80                return Ok(format);
81            }
82            // For text formats, validate that content matches expected format
83            if validate_format_content(content, &format) {
84                return Ok(format);
85            }
86        }
87    }
88
89    // Fall back to content-based detection
90    detect_format_from_content(content)
91}
92
93/// Detect format based on filename and extension
94fn detect_format_from_filename(filename: &str) -> Option<FileFormat> {
95    let path = Path::new(filename);
96    let lower_name = filename.to_lowercase();
97
98    // Check for compressed formats first (multi-extension patterns)
99    if lower_name.ends_with(".vcf.gz") {
100        return Some(FileFormat::Vcf);
101    }
102    if lower_name.ends_with(".fa.gz")
103        || lower_name.ends_with(".fasta.gz")
104        || lower_name.ends_with(".fna.gz")
105        || lower_name.ends_with(".fa.bgz")
106        || lower_name.ends_with(".fasta.bgz")
107        || lower_name.ends_with(".fna.bgz")
108    {
109        return Some(FileFormat::Fasta);
110    }
111
112    let extension = path.extension()?.to_str()?.to_lowercase();
113
114    match extension.as_str() {
115        "sam" => Some(FileFormat::Sam),
116        "bam" => Some(FileFormat::Bam),
117        "cram" => Some(FileFormat::Cram),
118        "dict" => Some(FileFormat::Dict),
119        "vcf" => Some(FileFormat::Vcf),
120        "fai" => Some(FileFormat::Fai),
121        "fa" | "fasta" | "fna" => Some(FileFormat::Fasta),
122        "tsv" | "csv" => Some(FileFormat::Tsv),
123        "txt" => {
124            // Disambiguate .txt files based on filename patterns
125            if lower_name.contains("assembly") || lower_name.contains("report") {
126                Some(FileFormat::NcbiReport)
127            } else if lower_name.ends_with(".dict.txt") {
128                Some(FileFormat::Dict)
129            } else {
130                // Default to SAM for .txt files
131                Some(FileFormat::Sam)
132            }
133        }
134        _ => None,
135    }
136}
137
/// Check if a line starts with a SAM record type prefix followed by a tab or space.
///
/// SAM headers use tabs as delimiters, but copy-pasted text often has spaces instead.
/// A line consisting of the bare prefix, or one where the prefix runs into
/// further characters (e.g. `@SQX`), is not a match.
fn is_sam_record(line: &str, prefix: &str) -> bool {
    line.strip_prefix(prefix)
        .map_or(false, |rest| matches!(rest.chars().next(), Some('\t' | ' ')))
}
148
149/// Detect format from file content analysis
150fn detect_format_from_content(content: &str) -> Result<FileFormat, FormatError> {
151    let content_trimmed = content.trim();
152
153    // Check for empty content
154    if content_trimmed.is_empty() {
155        return Err(FormatError::UnknownFormat);
156    }
157
158    // Check for binary content (non-UTF8 or control characters)
159    if content
160        .chars()
161        .any(|c| c.is_control() && c != '\n' && c != '\r' && c != '\t')
162    {
163        return Err(FormatError::UnsupportedBinary);
164    }
165
166    let lines: Vec<&str> = content_trimmed.lines().take(20).collect(); // Sample first 20 lines
167
168    // Picard dictionary: starts with @HD and has @SQ lines (check BEFORE Sam)
169    if lines.iter().any(|line| is_sam_record(line, "@HD"))
170        && lines.iter().any(|line| is_sam_record(line, "@SQ"))
171    {
172        return Ok(FileFormat::Dict);
173    }
174
175    // SAM header format: starts with @SQ lines
176    if lines.iter().any(|line| is_sam_record(line, "@SQ")) {
177        return Ok(FileFormat::Sam);
178    }
179
180    // VCF format: starts with ## comments and has ##contig lines
181    if lines
182        .iter()
183        .any(|line| line.starts_with("##fileformat=VCF"))
184        || (lines.iter().any(|line| line.starts_with("##"))
185            && lines.iter().any(|line| line.starts_with("##contig=")))
186    {
187        return Ok(FileFormat::Vcf);
188    }
189
190    // NCBI assembly report: has specific column headers (check all lines for header)
191    if lines.iter().any(|line| {
192        line.contains("Sequence-Name")
193            && line.contains("Sequence-Role")
194            && line.contains("Assigned-Molecule")
195    }) {
196        return Ok(FileFormat::NcbiReport);
197    }
198
199    // TSV format: tab-separated with consistent column count
200    if lines.len() > 1 {
201        let first_line_cols = lines[0].split('\t').count();
202        if first_line_cols > 2
203            && lines
204                .iter()
205                .take(5)
206                .all(|line| line.split('\t').count() == first_line_cols)
207        {
208            // Check if it looks like sequence data (has length/size columns)
209            if lines[0].to_lowercase().contains("length")
210                || lines[0].to_lowercase().contains("size")
211                || lines[0].to_lowercase().contains("sequence")
212            {
213                return Ok(FileFormat::Tsv);
214            }
215        }
216    }
217
218    // CSV format: comma-separated
219    if lines.len() > 1 {
220        let first_line_cols = lines[0].split(',').count();
221        if first_line_cols > 2
222            && lines
223                .iter()
224                .take(5)
225                .all(|line| line.split(',').count() == first_line_cols)
226            && (lines[0].to_lowercase().contains("length")
227                || lines[0].to_lowercase().contains("size")
228                || lines[0].to_lowercase().contains("sequence"))
229        {
230            return Ok(FileFormat::Tsv);
231        }
232    }
233
234    // FAI format: exactly 5 tab-separated columns (name, length, offset, line_bases, line_width)
235    // All non-empty, non-comment lines should have 5 columns with numeric values in columns 2-5
236    if !lines.is_empty() {
237        let fai_lines: Vec<&&str> = lines
238            .iter()
239            .filter(|line| !line.is_empty() && !line.starts_with('#'))
240            .collect();
241
242        if !fai_lines.is_empty()
243            && fai_lines.iter().all(|line| {
244                let fields: Vec<&str> = line.split('\t').collect();
245                if fields.len() != 5 {
246                    return false;
247                }
248                // All fields after the first should be numeric
249                fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
250            })
251        {
252            return Ok(FileFormat::Fai);
253        }
254    }
255
256    // If content looks like plain text with sequence-like data, assume SAM header
257    if lines.iter().any(|line| {
258        line.contains("chr")
259            || line.contains("scaffold")
260            || line.contains("contig")
261            || line.to_lowercase().contains("sequence")
262            || line.to_lowercase().contains("length")
263    }) {
264        return Ok(FileFormat::Sam);
265    }
266
267    Err(FormatError::UnknownFormat)
268}
269
270/// Validate that content matches the expected format
271#[allow(clippy::trivially_copy_pass_by_ref)] // Clearer API with reference
272fn validate_format_content(content: &str, format: &FileFormat) -> bool {
273    match format {
274        FileFormat::Sam => {
275            content.contains("@SQ") || content.contains("SN:") || content.contains("LN:")
276        }
277        FileFormat::Dict => content.contains("@HD") && content.contains("@SQ"),
278        FileFormat::Vcf => {
279            content.contains("##")
280                && (content.contains("##contig=") || content.contains("##fileformat=VCF"))
281        }
282        FileFormat::NcbiReport => {
283            content.contains("Sequence-Name") || content.contains("Sequence-Role")
284        }
285        FileFormat::Tsv => {
286            content.contains('\t')
287                && (content.to_lowercase().contains("length")
288                    || content.to_lowercase().contains("sequence"))
289        }
290        FileFormat::Fai => {
291            // FAI format has 5 tab-separated columns
292            let lines: Vec<&str> = content.lines().take(5).collect();
293            lines.iter().any(|line| {
294                let fields: Vec<&str> = line.split('\t').collect();
295                fields.len() == 5 && fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
296            })
297        }
298        FileFormat::Bam | FileFormat::Cram | FileFormat::Fasta => {
299            // Binary formats should not be validated against text content
300            false
301        }
302        FileFormat::Auto => true, // Auto-detect always passes validation
303    }
304}
305
306/// Parse content with the specified format
307///
308/// # Errors
309///
310/// Returns `ParseError::ParseFailed` if the content cannot be parsed with the
311/// specified format.
312pub fn parse_with_format(content: &str, format: FileFormat) -> Result<QueryHeader, ParseError> {
313    match format {
314        FileFormat::Sam => {
315            crate::parsing::sam::parse_header_text(content).map_err(|e| ParseError::ParseFailed {
316                format: FileFormat::Sam,
317                message: e.to_string(),
318            })
319        }
320        FileFormat::Dict => {
321            crate::parsing::dict::parse_dict_text(content).map_err(|e| ParseError::ParseFailed {
322                format: FileFormat::Dict,
323                message: e.to_string(),
324            })
325        }
326        FileFormat::Vcf => crate::parsing::vcf::parse_vcf_header_text(content).map_err(|e| {
327            ParseError::ParseFailed {
328                format: FileFormat::Vcf,
329                message: e.to_string(),
330            }
331        }),
332        FileFormat::NcbiReport => {
333            // NCBI report parser returns Vec<NcbiContigEntry>, we need to convert
334            match crate::parsing::ncbi_report::parse_ncbi_report_text(content) {
335                Ok(entries) => {
336                    let contigs = entries.into_iter().map(|entry| entry.to_contig()).collect();
337                    Ok(crate::core::header::QueryHeader::new(contigs))
338                }
339                Err(e) => Err(ParseError::ParseFailed {
340                    format: FileFormat::NcbiReport,
341                    message: e.to_string(),
342                }),
343            }
344        }
345        FileFormat::Tsv => {
346            // TSV parser requires delimiter - try tab first, then comma
347            match crate::parsing::tsv::parse_tsv_text(content, '\t') {
348                Ok(query) => Ok(query),
349                Err(_) => crate::parsing::tsv::parse_tsv_text(content, ',').map_err(|e| {
350                    ParseError::ParseFailed {
351                        format: FileFormat::Tsv,
352                        message: format!("Failed to parse as TSV or CSV: {e}"),
353                    }
354                }),
355            }
356        }
357        FileFormat::Fai => {
358            crate::parsing::fai::parse_fai_text(content).map_err(|e| ParseError::ParseFailed {
359                format: FileFormat::Fai,
360                message: e.to_string(),
361            })
362        }
363        FileFormat::Bam => Err(ParseError::ParseFailed {
364            format: FileFormat::Bam,
365            message: "BAM files must be parsed as binary, not text".to_string(),
366        }),
367        FileFormat::Cram => Err(ParseError::ParseFailed {
368            format: FileFormat::Cram,
369            message: "CRAM files must be parsed as binary, not text".to_string(),
370        }),
371        FileFormat::Fasta => Err(ParseError::ParseFailed {
372            format: FileFormat::Fasta,
373            message: "FASTA files must be parsed as binary, not text".to_string(),
374        }),
375        FileFormat::Auto => {
376            // For auto-detection, detect format first then parse
377            let detected_format =
378                detect_format_from_content(content).map_err(|e| ParseError::ParseFailed {
379                    format: FileFormat::Auto,
380                    message: format!("Auto-detection failed: {e}"),
381                })?;
382
383            parse_with_format(content, detected_format)
384        }
385    }
386}
387
388/// Parse binary file content (for BAM/CRAM/FASTA files)
389///
390/// BAM and CRAM are parsed directly from memory via a `Cursor<&[u8]>`.
391/// FASTA still requires a temporary file since the parser must read full sequences
392/// to compute contig lengths.
393///
394/// # Errors
395///
396/// Returns `ParseError::Io` if a temporary file cannot be created or written (FASTA only),
397/// or `ParseError::ParseFailed` if parsing fails.
398pub fn parse_binary_file(
399    file_content: &[u8],
400    format: FileFormat,
401) -> Result<QueryHeader, ParseError> {
402    match format {
403        FileFormat::Bam => {
404            let cursor = std::io::Cursor::new(file_content);
405            crate::parsing::sam::parse_bam_from_reader(cursor).map_err(|e| {
406                ParseError::ParseFailed {
407                    format,
408                    message: format!("BAM file parsing failed: {e}"),
409                }
410            })
411        }
412        FileFormat::Cram => {
413            let cursor = std::io::Cursor::new(file_content);
414            crate::parsing::sam::parse_cram_from_reader(cursor).map_err(|e| {
415                ParseError::ParseFailed {
416                    format,
417                    message: format!("CRAM file parsing failed: {e}"),
418                }
419            })
420        }
421        FileFormat::Fasta => {
422            // FASTA still needs temp file — must read full sequences for lengths
423            use std::io::Write;
424            use tempfile::NamedTempFile;
425
426            let is_gzipped =
427                file_content.len() >= 2 && file_content[0] == 0x1f && file_content[1] == 0x8b;
428            let file_extension = if is_gzipped { ".fa.gz" } else { ".fa" };
429
430            let mut temp_file =
431                NamedTempFile::with_suffix(file_extension).map_err(ParseError::Io)?;
432            temp_file.write_all(file_content).map_err(ParseError::Io)?;
433
434            let result = crate::parsing::fasta::parse_fasta_file(temp_file.path());
435            result.map_err(|e| ParseError::ParseFailed {
436                format,
437                message: format!("FASTA file parsing failed: {e}"),
438            })
439        }
440        _ => Err(ParseError::ParseFailed {
441            format,
442            message: "Format is not a binary file format".to_string(),
443        }),
444    }
445}
446
447/// Parse a binary file directly from a file path (for streamed uploads).
448///
449/// Unlike [`parse_binary_file`], this function does not need to create a temporary file —
450/// the caller already has one.  The file may be truncated after the header; only the
451/// header portion is needed for BAM/CRAM.
452///
453/// # Errors
454///
455/// Returns `ParseError::ParseFailed` if parsing fails, or if the format is not a
456/// supported binary format.
457pub fn parse_binary_file_from_path(
458    path: &std::path::Path,
459    format: FileFormat,
460) -> Result<QueryHeader, ParseError> {
461    match format {
462        FileFormat::Bam | FileFormat::Cram => {
463            crate::parsing::sam::parse_file(path).map_err(|e| ParseError::ParseFailed {
464                format,
465                message: format!("Binary file parsing failed: {e}"),
466            })
467        }
468        FileFormat::Fasta => {
469            crate::parsing::fasta::parse_fasta_file(path).map_err(|e| ParseError::ParseFailed {
470                format,
471                message: format!("FASTA file parsing failed: {e}"),
472            })
473        }
474        _ => Err(ParseError::ParseFailed {
475            format,
476            message: "Format is not a binary file format".to_string(),
477        }),
478    }
479}
480
#[cfg(test)]
mod tests {
    use super::*;

    /// Helper: build a minimal BAM byte buffer from a SAM header string.
    fn build_bam_bytes(header_text: &str) -> Vec<u8> {
        use noodles::bam;
        use noodles::sam;

        let mut reader = sam::io::Reader::new(header_text.as_bytes());
        let header = reader.read_header().unwrap();

        let mut buf = Vec::new();
        {
            let mut writer = bam::io::Writer::new(&mut buf);
            writer.write_header(&header).unwrap();
        }
        buf
    }

    #[test]
    fn test_filename_detection() {
        let cases = [
            ("test.sam", Some(FileFormat::Sam)),
            ("test.bam", Some(FileFormat::Bam)),
            ("test.dict", Some(FileFormat::Dict)),
            ("test.vcf", Some(FileFormat::Vcf)),
            ("test.vcf.gz", Some(FileFormat::Vcf)),
            ("assembly_report.txt", Some(FileFormat::NcbiReport)),
            ("reference.fai", Some(FileFormat::Fai)),
            ("reference.fa", Some(FileFormat::Fasta)),
            ("reference.fasta", Some(FileFormat::Fasta)),
            ("reference.fa.gz", Some(FileFormat::Fasta)),
            ("reference.fasta.gz", Some(FileFormat::Fasta)),
            ("unknown.xyz", None),
        ];
        for (filename, expected) in cases {
            assert_eq!(
                detect_format_from_filename(filename),
                expected,
                "filename: {filename}"
            );
        }
    }

    #[test]
    fn test_sam_header_detection() {
        let content = "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Sam));
    }

    #[test]
    fn test_dict_detection() {
        let content = "@HD\tVN:1.0\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\tM5:abc123\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Dict));
    }

    #[test]
    fn test_vcf_detection() {
        let content = "##fileformat=VCFv4.2\n##contig=<ID=chr1,length=248956422>\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Vcf));
    }

    #[test]
    fn test_ncbi_report_detection() {
        let content =
            "# Sequence-Name\tSequence-Role\tAssigned-Molecule\tAssigned-Molecule-Location/Type\n";
        assert_eq!(
            detect_format_from_content(content),
            Ok(FileFormat::NcbiReport)
        );
    }

    #[test]
    fn test_fai_detection() {
        let content = "chr1\t248956422\t112\t70\t71\nchr2\t242193529\t253404903\t70\t71\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Fai));
    }

    #[test]
    fn test_empty_content_is_unknown() {
        assert_eq!(
            detect_format_from_content(""),
            Err(FormatError::UnknownFormat)
        );
        assert_eq!(
            detect_format_from_content(" \n\t\n"),
            Err(FormatError::UnknownFormat)
        );
    }

    #[test]
    fn test_binary_content_is_unsupported() {
        assert_eq!(
            detect_format_from_content("BAM\u{1}\u{0}\u{0}"),
            Err(FormatError::UnsupportedBinary)
        );
    }

    #[test]
    fn test_fai_validation() {
        assert!(validate_format_content(
            "chr1\t248956422\t112\t70\t71",
            &FileFormat::Fai
        ));
        assert!(!validate_format_content(
            "chr1\t248956422\t112",
            &FileFormat::Fai
        ));
    }

    #[test]
    fn test_format_validation() {
        assert!(validate_format_content(
            "@SQ\tSN:chr1\tLN:123",
            &FileFormat::Sam
        ));
        assert!(!validate_format_content("random text", &FileFormat::Sam));

        assert!(validate_format_content(
            "##contig=<ID=chr1>",
            &FileFormat::Vcf
        ));
        assert!(!validate_format_content("@SQ\tSN:chr1", &FileFormat::Vcf));
    }

    #[test]
    fn test_sam_header_detection_with_spaces() {
        let content = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Sam));
    }

    #[test]
    fn test_dict_detection_with_spaces() {
        let content = "@HD VN:1.0 SO:coordinate\n@SQ SN:chr1 LN:248956422\n";
        assert_eq!(detect_format_from_content(content), Ok(FileFormat::Dict));
    }

    #[test]
    fn test_sam_validation_with_spaces() {
        assert!(validate_format_content(
            "@SQ SN:chr1 LN:123",
            &FileFormat::Sam
        ));
    }

    #[test]
    fn test_combined_detection() {
        let content = "@SQ\tSN:chr1\tLN:248956422\n";
        assert_eq!(
            detect_format(content, Some("test.sam")),
            Ok(FileFormat::Sam)
        );
        // Content overrides filename: a `.dict` name without an @HD line
        // fails validation and falls back to content detection.
        assert_eq!(
            detect_format(content, Some("test.dict")),
            Ok(FileFormat::Sam)
        );
        assert_eq!(detect_format(content, None), Ok(FileFormat::Sam));
    }

    #[test]
    fn test_binary_filenames_are_trusted() {
        // BAM/CRAM names are accepted without inspecting the (text) content.
        assert_eq!(detect_format("", Some("reads.bam")), Ok(FileFormat::Bam));
        assert_eq!(detect_format("", Some("reads.cram")), Ok(FileFormat::Cram));
    }

    #[test]
    fn test_parse_binary_file_bam_from_bytes() {
        use noodles::bam;
        use noodles::sam;
        use noodles::sam::header::record::value::map::{Map, ReferenceSequence};
        use std::num::NonZeroUsize;

        let header = sam::Header::builder()
            .add_reference_sequence(
                "chr1",
                Map::<ReferenceSequence>::new(NonZeroUsize::new(248_956_422).unwrap()),
            )
            .build();
        let mut bam_bytes = Vec::new();
        {
            let mut writer = bam::io::Writer::new(&mut bam_bytes);
            writer.write_header(&header).unwrap();
        }

        let query = parse_binary_file(&bam_bytes, FileFormat::Bam).unwrap();
        assert_eq!(query.contigs.len(), 1);
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
    }

    #[test]
    fn test_parse_binary_file_rejects_text_format() {
        assert!(parse_binary_file(b"@SQ\tSN:chr1\tLN:1\n", FileFormat::Sam).is_err());
    }

    #[test]
    fn test_parse_binary_file_from_path_bam() {
        use std::io::Write;
        use tempfile::NamedTempFile;

        let bam_bytes = build_bam_bytes(
            "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n@SQ\tSN:chr2\tLN:242193529\n",
        );

        let mut temp = NamedTempFile::with_suffix(".bam").unwrap();
        temp.write_all(&bam_bytes).unwrap();

        let query = parse_binary_file_from_path(temp.path(), FileFormat::Bam)
            .expect("a BAM header written to disk should parse");
        assert_eq!(query.contigs.len(), 2);
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
        assert_eq!(query.contigs[1].name, "chr2");
        assert_eq!(query.contigs[1].length, 242_193_529);
    }

    #[test]
    fn test_parse_binary_file_from_path_truncated_bam() {
        // Verify that parsing works on a BAM file truncated after the header.
        // This simulates the server-side streaming behavior where only the first
        // N bytes of a large BAM are written to a temp file.
        use std::io::Write;
        use tempfile::NamedTempFile;

        let mut bam_bytes = build_bam_bytes("@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n");

        // Append junk data to simulate a truncated file (records cut off mid-stream)
        bam_bytes.extend_from_slice(&[0u8; 1024]);

        let mut temp = NamedTempFile::with_suffix(".bam").unwrap();
        temp.write_all(&bam_bytes).unwrap();

        let query = parse_binary_file_from_path(temp.path(), FileFormat::Bam)
            .expect("only the header is needed, so trailing junk should be ignored");
        assert_eq!(query.contigs.len(), 1);
        assert_eq!(query.contigs[0].name, "chr1");
    }

    #[test]
    fn test_parse_binary_file_from_path_unsupported_format() {
        use std::io::Write;
        use tempfile::NamedTempFile;

        let mut temp = NamedTempFile::with_suffix(".txt").unwrap();
        temp.write_all(b"not a binary file").unwrap();

        let result = parse_binary_file_from_path(temp.path(), FileFormat::Sam);
        assert!(result.is_err());
    }
}