Skip to main content

ref_solver/web/
format_detection.rs

1use crate::core::header::QueryHeader;
2use std::path::Path;
3
/// Supported file formats for reference identification.
///
/// The enum is a plain, field-less tag, so it is `Copy` and can be compared,
/// hashed and used as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileFormat {
    /// SAM text files or plain-text headers (BAM and CRAM have their own variants)
    Sam,
    /// BAM binary format
    Bam,
    /// CRAM binary format
    Cram,
    /// Picard sequence dictionary files
    Dict,
    /// VCF files with contig headers
    Vcf,
    /// NCBI assembly report files
    NcbiReport,
    /// TSV/CSV tabular files
    Tsv,
    /// FASTA index (.fai) files
    Fai,
    /// FASTA sequence files
    Fasta,
    /// Automatically detect format
    Auto,
}
28
29/// Errors that can occur during format detection
30#[derive(Debug, PartialEq, thiserror::Error)]
31pub enum FormatError {
32    #[error("Unable to detect file format from content and filename")]
33    UnknownFormat,
34    #[error("File appears to be binary but cannot determine specific format")]
35    UnsupportedBinary,
36}
37
38/// Errors that can occur during parsing
39#[derive(Debug, thiserror::Error)]
40pub enum ParseError {
41    #[error("Failed to parse {format:?} content: {message}")]
42    ParseFailed { format: FileFormat, message: String },
43    #[error("IO error: {0}")]
44    Io(#[from] std::io::Error),
45}
46
47impl FileFormat {
48    /// Get the display name for this format
49    #[must_use]
50    #[allow(clippy::trivially_copy_pass_by_ref)] // Idiomatic method signature
51    pub fn display_name(&self) -> &'static str {
52        match self {
53            FileFormat::Sam => "SAM/BAM Header",
54            FileFormat::Bam => "BAM File",
55            FileFormat::Cram => "CRAM File",
56            FileFormat::Dict => "Sequence Dictionary",
57            FileFormat::Vcf => "VCF File",
58            FileFormat::NcbiReport => "NCBI Assembly Report",
59            FileFormat::Tsv => "TSV/CSV Table",
60            FileFormat::Fai => "FASTA Index",
61            FileFormat::Fasta => "FASTA File",
62            FileFormat::Auto => "Auto-detect",
63        }
64    }
65}
66
67/// Detect file format from content and optional filename
68///
69/// # Errors
70///
71/// Returns `FormatError::UnknownFormat` if the format cannot be detected, or
72/// `FormatError::UnsupportedBinary` if the file appears to be binary but the
73/// specific format cannot be determined.
74pub fn detect_format(content: &str, filename: Option<&str>) -> Result<FileFormat, FormatError> {
75    // First try filename-based detection if available
76    if let Some(name) = filename {
77        if let Some(format) = detect_format_from_filename(name) {
78            // For binary formats, trust filename-based detection without content validation
79            if matches!(format, FileFormat::Bam | FileFormat::Cram) {
80                return Ok(format);
81            }
82            // For text formats, validate that content matches expected format
83            if validate_format_content(content, &format) {
84                return Ok(format);
85            }
86        }
87    }
88
89    // Fall back to content-based detection
90    detect_format_from_content(content)
91}
92
93/// Detect format based on filename and extension
94fn detect_format_from_filename(filename: &str) -> Option<FileFormat> {
95    let path = Path::new(filename);
96    let lower_name = filename.to_lowercase();
97
98    // Check for compressed formats first (multi-extension patterns)
99    if lower_name.ends_with(".vcf.gz") {
100        return Some(FileFormat::Vcf);
101    }
102    if lower_name.ends_with(".fa.gz")
103        || lower_name.ends_with(".fasta.gz")
104        || lower_name.ends_with(".fna.gz")
105        || lower_name.ends_with(".fa.bgz")
106        || lower_name.ends_with(".fasta.bgz")
107        || lower_name.ends_with(".fna.bgz")
108    {
109        return Some(FileFormat::Fasta);
110    }
111
112    let extension = path.extension()?.to_str()?.to_lowercase();
113
114    match extension.as_str() {
115        "sam" => Some(FileFormat::Sam),
116        "bam" => Some(FileFormat::Bam),
117        "cram" => Some(FileFormat::Cram),
118        "dict" => Some(FileFormat::Dict),
119        "vcf" => Some(FileFormat::Vcf),
120        "fai" => Some(FileFormat::Fai),
121        "fa" | "fasta" | "fna" => Some(FileFormat::Fasta),
122        "tsv" | "csv" => Some(FileFormat::Tsv),
123        "txt" => {
124            // Disambiguate .txt files based on filename patterns
125            if lower_name.contains("assembly") || lower_name.contains("report") {
126                Some(FileFormat::NcbiReport)
127            } else if lower_name.ends_with(".dict.txt") {
128                Some(FileFormat::Dict)
129            } else {
130                // Default to SAM for .txt files
131                Some(FileFormat::Sam)
132            }
133        }
134        _ => None,
135    }
136}
137
138/// Detect format from file content analysis
139fn detect_format_from_content(content: &str) -> Result<FileFormat, FormatError> {
140    let content_trimmed = content.trim();
141
142    // Check for empty content
143    if content_trimmed.is_empty() {
144        return Err(FormatError::UnknownFormat);
145    }
146
147    // Check for binary content (non-UTF8 or control characters)
148    if content
149        .chars()
150        .any(|c| c.is_control() && c != '\n' && c != '\r' && c != '\t')
151    {
152        return Err(FormatError::UnsupportedBinary);
153    }
154
155    let lines: Vec<&str> = content_trimmed.lines().take(20).collect(); // Sample first 20 lines
156
157    // Picard dictionary: starts with @HD and has @SQ lines (check BEFORE Sam)
158    if lines.iter().any(|line| line.starts_with("@HD\t"))
159        && lines.iter().any(|line| line.starts_with("@SQ\t"))
160    {
161        return Ok(FileFormat::Dict);
162    }
163
164    // SAM header format: starts with @SQ lines
165    if lines.iter().any(|line| line.starts_with("@SQ\t")) {
166        return Ok(FileFormat::Sam);
167    }
168
169    // VCF format: starts with ## comments and has ##contig lines
170    if lines
171        .iter()
172        .any(|line| line.starts_with("##fileformat=VCF"))
173        || (lines.iter().any(|line| line.starts_with("##"))
174            && lines.iter().any(|line| line.starts_with("##contig=")))
175    {
176        return Ok(FileFormat::Vcf);
177    }
178
179    // NCBI assembly report: has specific column headers (check all lines for header)
180    if lines.iter().any(|line| {
181        line.contains("Sequence-Name")
182            && line.contains("Sequence-Role")
183            && line.contains("Assigned-Molecule")
184    }) {
185        return Ok(FileFormat::NcbiReport);
186    }
187
188    // TSV format: tab-separated with consistent column count
189    if lines.len() > 1 {
190        let first_line_cols = lines[0].split('\t').count();
191        if first_line_cols > 2
192            && lines
193                .iter()
194                .take(5)
195                .all(|line| line.split('\t').count() == first_line_cols)
196        {
197            // Check if it looks like sequence data (has length/size columns)
198            if lines[0].to_lowercase().contains("length")
199                || lines[0].to_lowercase().contains("size")
200                || lines[0].to_lowercase().contains("sequence")
201            {
202                return Ok(FileFormat::Tsv);
203            }
204        }
205    }
206
207    // CSV format: comma-separated
208    if lines.len() > 1 {
209        let first_line_cols = lines[0].split(',').count();
210        if first_line_cols > 2
211            && lines
212                .iter()
213                .take(5)
214                .all(|line| line.split(',').count() == first_line_cols)
215            && (lines[0].to_lowercase().contains("length")
216                || lines[0].to_lowercase().contains("size")
217                || lines[0].to_lowercase().contains("sequence"))
218        {
219            return Ok(FileFormat::Tsv);
220        }
221    }
222
223    // FAI format: exactly 5 tab-separated columns (name, length, offset, line_bases, line_width)
224    // All non-empty, non-comment lines should have 5 columns with numeric values in columns 2-5
225    if !lines.is_empty() {
226        let fai_lines: Vec<&&str> = lines
227            .iter()
228            .filter(|line| !line.is_empty() && !line.starts_with('#'))
229            .collect();
230
231        if !fai_lines.is_empty()
232            && fai_lines.iter().all(|line| {
233                let fields: Vec<&str> = line.split('\t').collect();
234                if fields.len() != 5 {
235                    return false;
236                }
237                // All fields after the first should be numeric
238                fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
239            })
240        {
241            return Ok(FileFormat::Fai);
242        }
243    }
244
245    // If content looks like plain text with sequence-like data, assume SAM header
246    if lines.iter().any(|line| {
247        line.contains("chr")
248            || line.contains("scaffold")
249            || line.contains("contig")
250            || line.to_lowercase().contains("sequence")
251            || line.to_lowercase().contains("length")
252    }) {
253        return Ok(FileFormat::Sam);
254    }
255
256    Err(FormatError::UnknownFormat)
257}
258
259/// Validate that content matches the expected format
260#[allow(clippy::trivially_copy_pass_by_ref)] // Clearer API with reference
261fn validate_format_content(content: &str, format: &FileFormat) -> bool {
262    match format {
263        FileFormat::Sam => {
264            content.contains("@SQ") || content.contains("SN:") || content.contains("LN:")
265        }
266        FileFormat::Dict => content.contains("@HD") && content.contains("@SQ"),
267        FileFormat::Vcf => {
268            content.contains("##")
269                && (content.contains("##contig=") || content.contains("##fileformat=VCF"))
270        }
271        FileFormat::NcbiReport => {
272            content.contains("Sequence-Name") || content.contains("Sequence-Role")
273        }
274        FileFormat::Tsv => {
275            content.contains('\t')
276                && (content.to_lowercase().contains("length")
277                    || content.to_lowercase().contains("sequence"))
278        }
279        FileFormat::Fai => {
280            // FAI format has 5 tab-separated columns
281            let lines: Vec<&str> = content.lines().take(5).collect();
282            lines.iter().any(|line| {
283                let fields: Vec<&str> = line.split('\t').collect();
284                fields.len() == 5 && fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
285            })
286        }
287        FileFormat::Bam | FileFormat::Cram | FileFormat::Fasta => {
288            // Binary formats should not be validated against text content
289            false
290        }
291        FileFormat::Auto => true, // Auto-detect always passes validation
292    }
293}
294
295/// Parse content with the specified format
296///
297/// # Errors
298///
299/// Returns `ParseError::ParseFailed` if the content cannot be parsed with the
300/// specified format.
301pub fn parse_with_format(content: &str, format: FileFormat) -> Result<QueryHeader, ParseError> {
302    match format {
303        FileFormat::Sam => {
304            crate::parsing::sam::parse_header_text(content).map_err(|e| ParseError::ParseFailed {
305                format: FileFormat::Sam,
306                message: e.to_string(),
307            })
308        }
309        FileFormat::Dict => {
310            crate::parsing::dict::parse_dict_text(content).map_err(|e| ParseError::ParseFailed {
311                format: FileFormat::Dict,
312                message: e.to_string(),
313            })
314        }
315        FileFormat::Vcf => crate::parsing::vcf::parse_vcf_header_text(content).map_err(|e| {
316            ParseError::ParseFailed {
317                format: FileFormat::Vcf,
318                message: e.to_string(),
319            }
320        }),
321        FileFormat::NcbiReport => {
322            // NCBI report parser returns Vec<NcbiContigEntry>, we need to convert
323            match crate::parsing::ncbi_report::parse_ncbi_report_text(content) {
324                Ok(entries) => {
325                    let contigs = entries.into_iter().map(|entry| entry.to_contig()).collect();
326                    Ok(crate::core::header::QueryHeader::new(contigs))
327                }
328                Err(e) => Err(ParseError::ParseFailed {
329                    format: FileFormat::NcbiReport,
330                    message: e.to_string(),
331                }),
332            }
333        }
334        FileFormat::Tsv => {
335            // TSV parser requires delimiter - try tab first, then comma
336            match crate::parsing::tsv::parse_tsv_text(content, '\t') {
337                Ok(query) => Ok(query),
338                Err(_) => crate::parsing::tsv::parse_tsv_text(content, ',').map_err(|e| {
339                    ParseError::ParseFailed {
340                        format: FileFormat::Tsv,
341                        message: format!("Failed to parse as TSV or CSV: {e}"),
342                    }
343                }),
344            }
345        }
346        FileFormat::Fai => {
347            crate::parsing::fai::parse_fai_text(content).map_err(|e| ParseError::ParseFailed {
348                format: FileFormat::Fai,
349                message: e.to_string(),
350            })
351        }
352        FileFormat::Bam => Err(ParseError::ParseFailed {
353            format: FileFormat::Bam,
354            message: "BAM files must be parsed as binary, not text".to_string(),
355        }),
356        FileFormat::Cram => Err(ParseError::ParseFailed {
357            format: FileFormat::Cram,
358            message: "CRAM files must be parsed as binary, not text".to_string(),
359        }),
360        FileFormat::Fasta => Err(ParseError::ParseFailed {
361            format: FileFormat::Fasta,
362            message: "FASTA files must be parsed as binary, not text".to_string(),
363        }),
364        FileFormat::Auto => {
365            // For auto-detection, detect format first then parse
366            let detected_format =
367                detect_format_from_content(content).map_err(|e| ParseError::ParseFailed {
368                    format: FileFormat::Auto,
369                    message: format!("Auto-detection failed: {e}"),
370                })?;
371
372            parse_with_format(content, detected_format)
373        }
374    }
375}
376
377/// Parse binary file content (for BAM/CRAM files)
378///
379/// Note: This creates a secure temporary file since the underlying parsers expect file paths
380///
381/// # Errors
382///
383/// Returns `ParseError::Io` if the temporary file cannot be created or written,
384/// or `ParseError::ParseFailed` if parsing fails.
385pub fn parse_binary_file(
386    file_content: &[u8],
387    format: FileFormat,
388) -> Result<QueryHeader, ParseError> {
389    use std::io::Write;
390    use tempfile::NamedTempFile;
391
392    match format {
393        FileFormat::Bam | FileFormat::Cram => {
394            // Create a secure temporary file with cryptographically random name
395            let file_extension = match format {
396                FileFormat::Bam => ".bam",
397                FileFormat::Cram => ".cram",
398                _ => ".bin",
399            };
400
401            let mut temp_file =
402                NamedTempFile::with_suffix(file_extension).map_err(ParseError::Io)?;
403
404            // Write bytes to secure temporary file
405            temp_file.write_all(file_content).map_err(ParseError::Io)?;
406
407            // Parse the file using the secure temp file path
408            let result = crate::parsing::sam::parse_file(temp_file.path());
409
410            // File automatically deleted when NamedTempFile drops
411            result.map_err(|_e| ParseError::ParseFailed {
412                format,
413                message: "Binary file parsing failed".to_string(), // Sanitized error message
414            })
415        }
416        FileFormat::Fasta => {
417            // Determine appropriate extension based on content (check for gzip magic bytes)
418            let is_gzipped =
419                file_content.len() >= 2 && file_content[0] == 0x1f && file_content[1] == 0x8b;
420            let file_extension = if is_gzipped { ".fa.gz" } else { ".fa" };
421
422            let mut temp_file =
423                NamedTempFile::with_suffix(file_extension).map_err(ParseError::Io)?;
424
425            // Write bytes to secure temporary file
426            temp_file.write_all(file_content).map_err(ParseError::Io)?;
427
428            // Parse the FASTA file using the secure temp file path
429            let result = crate::parsing::fasta::parse_fasta_file(temp_file.path());
430
431            // File automatically deleted when NamedTempFile drops
432            result.map_err(|_e| ParseError::ParseFailed {
433                format,
434                message: "Binary file parsing failed".to_string(), // Sanitized error message
435            })
436        }
437        _ => Err(ParseError::ParseFailed {
438            format,
439            message: "Format is not a binary file format".to_string(),
440        }),
441    }
442}
443
#[cfg(test)]
mod tests {
    use super::*;

    /// Representative filenames map to the expected format (or to `None`).
    #[test]
    fn test_filename_detection() {
        let cases = [
            ("test.sam", Some(FileFormat::Sam)),
            ("test.bam", Some(FileFormat::Bam)),
            ("test.dict", Some(FileFormat::Dict)),
            ("test.vcf", Some(FileFormat::Vcf)),
            ("test.vcf.gz", Some(FileFormat::Vcf)),
            ("assembly_report.txt", Some(FileFormat::NcbiReport)),
            ("reference.fai", Some(FileFormat::Fai)),
            ("reference.fa", Some(FileFormat::Fasta)),
            ("reference.fasta", Some(FileFormat::Fasta)),
            ("reference.fa.gz", Some(FileFormat::Fasta)),
            ("reference.fasta.gz", Some(FileFormat::Fasta)),
            ("unknown.xyz", None),
        ];

        for &(filename, expected) in &cases {
            assert_eq!(
                detect_format_from_filename(filename),
                expected,
                "filename: {filename}"
            );
        }
    }

    /// A lone `@SQ` line is recognised as a SAM header.
    #[test]
    fn test_sam_header_detection() {
        let header = "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n";
        let detected = detect_format_from_content(header);
        assert_eq!(detected, Ok(FileFormat::Sam));
    }

    /// `@HD` together with `@SQ` is recognised as a sequence dictionary.
    #[test]
    fn test_dict_detection() {
        let dictionary = "@HD\tVN:1.0\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\tM5:abc123\n";
        let detected = detect_format_from_content(dictionary);
        assert_eq!(detected, Ok(FileFormat::Dict));
    }

    /// A VCF header with file-format and contig lines is recognised as VCF.
    #[test]
    fn test_vcf_detection() {
        let header = "##fileformat=VCFv4.2\n##contig=<ID=chr1,length=248956422>\n";
        let detected = detect_format_from_content(header);
        assert_eq!(detected, Ok(FileFormat::Vcf));
    }

    /// The assembly report column header alone identifies an NCBI report.
    #[test]
    fn test_ncbi_report_detection() {
        let column_header =
            "# Sequence-Name\tSequence-Role\tAssigned-Molecule\tAssigned-Molecule-Location/Type\n";
        let detected = detect_format_from_content(column_header);
        assert_eq!(detected, Ok(FileFormat::NcbiReport));
    }

    /// Five-column numeric records are recognised as a FASTA index.
    #[test]
    fn test_fai_detection() {
        let index = "chr1\t248956422\t112\t70\t71\nchr2\t242193529\t253404903\t70\t71\n";
        let detected = detect_format_from_content(index);
        assert_eq!(detected, Ok(FileFormat::Fai));
    }

    /// FAI validation needs all five columns.
    #[test]
    fn test_fai_validation() {
        let complete = "chr1\t248956422\t112\t70\t71";
        let truncated = "chr1\t248956422\t112";

        assert!(validate_format_content(complete, &FileFormat::Fai));
        assert!(!validate_format_content(truncated, &FileFormat::Fai));
    }

    /// Validation accepts matching content and rejects mismatching content.
    #[test]
    fn test_format_validation() {
        let cases = [
            ("@SQ\tSN:chr1\tLN:123", FileFormat::Sam, true),
            ("random text", FileFormat::Sam, false),
            ("##contig=<ID=chr1>", FileFormat::Vcf, true),
            ("@SQ\tSN:chr1", FileFormat::Vcf, false),
        ];

        for &(content, format, expected) in &cases {
            assert_eq!(
                validate_format_content(content, &format),
                expected,
                "content: {content:?}, format: {format:?}"
            );
        }
    }

    /// With SAM content the result is SAM whatever the filename says.
    #[test]
    fn test_combined_detection() {
        let content = "@SQ\tSN:chr1\tLN:248956422\n";

        // `test.dict` does not validate as a dictionary, so content detection wins
        for filename in [Some("test.sam"), Some("test.dict"), None].iter().copied() {
            assert_eq!(
                detect_format(content, filename),
                Ok(FileFormat::Sam),
                "filename: {filename:?}"
            );
        }
    }
}
571}