use crate::core::header::QueryHeader;
use std::path::Path;
/// File formats this module can detect and dispatch to a parser.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so the format can be
/// used as a key in hash-based collections; every variant is a unit variant,
/// so equality is total.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileFormat {
    /// SAM-style header text (`@SQ` lines); also the fallback for otherwise
    /// unclassified text that mentions sequence-like keywords.
    Sam,
    /// BAM file; handled by the binary parsing entry points.
    Bam,
    /// CRAM file; handled by the binary parsing entry points.
    Cram,
    /// Sequence dictionary (`@HD` plus `@SQ` lines, `.dict` extension).
    Dict,
    /// VCF file (`##fileformat=VCF` / `##contig=` header lines).
    Vcf,
    /// NCBI assembly report.
    NcbiReport,
    /// Tab- or comma-separated table.
    Tsv,
    /// FASTA index (`.fai`): five tab-separated columns per record.
    Fai,
    /// FASTA file, optionally with a `.gz` / `.bgz` suffix.
    Fasta,
    /// Placeholder asking for the format to be detected from the content.
    Auto,
}
/// Errors returned when the file format cannot be determined.
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum FormatError {
/// No filename hint or content heuristic identified a format; also returned
/// when the content is empty after trimming whitespace.
#[error("Unable to detect file format from content and filename")]
UnknownFormat,
/// The content contains a control character other than newline, carriage
/// return, or tab, so it is treated as binary data.
#[error("File appears to be binary but cannot determine specific format")]
UnsupportedBinary,
}
/// Errors returned by the parsing entry points in this module.
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
/// The underlying parser rejected the input, or the requested format cannot
/// be handled by the entry point that was called. `format` is the format the
/// failure is reported against and `message` carries the parser's description.
#[error("Failed to parse {format:?} content: {message}")]
ParseFailed { format: FileFormat, message: String },
/// An I/O operation failed; converted automatically from `std::io::Error`.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
}
impl FileFormat {
#[must_use]
#[allow(clippy::trivially_copy_pass_by_ref)] pub fn display_name(&self) -> &'static str {
match self {
FileFormat::Sam => "SAM/BAM Header",
FileFormat::Bam => "BAM File",
FileFormat::Cram => "CRAM File",
FileFormat::Dict => "Sequence Dictionary",
FileFormat::Vcf => "VCF File",
FileFormat::NcbiReport => "NCBI Assembly Report",
FileFormat::Tsv => "TSV/CSV Table",
FileFormat::Fai => "FASTA Index",
FileFormat::Fasta => "FASTA File",
FileFormat::Auto => "Auto-detect",
}
}
}
pub fn detect_format(content: &str, filename: Option<&str>) -> Result<FileFormat, FormatError> {
if let Some(name) = filename {
if let Some(format) = detect_format_from_filename(name) {
if matches!(format, FileFormat::Bam | FileFormat::Cram) {
return Ok(format);
}
if validate_format_content(content, &format) {
return Ok(format);
}
}
}
detect_format_from_content(content)
}
/// Guesses the file format from the name of the file alone.
///
/// Only the final path component is inspected, case-insensitively, so that
/// directory names (for example `/data/reports/header.txt`) cannot influence
/// the result.
///
/// Rules, in order:
/// * `.vcf.gz` / `.vcf.bgz` map to `Vcf`.
/// * `.fa`, `.fasta`, `.fna` followed by `.gz` or `.bgz` map to `Fasta`.
/// * Otherwise the last extension decides. For `.txt`, a `.dict.txt` suffix
///   wins (`Dict`), then a name containing "assembly" or "report"
///   (`NcbiReport`), and any other `.txt` is assumed to be SAM header text.
///
/// Returns `None` when the path has no file name, no extension, or an
/// extension that is not recognised.
fn detect_format_from_filename(filename: &str) -> Option<FileFormat> {
    const COMPRESSED_VCF_SUFFIXES: &[&str] = &[".vcf.gz", ".vcf.bgz"];
    const COMPRESSED_FASTA_SUFFIXES: &[&str] = &[
        ".fa.gz",
        ".fasta.gz",
        ".fna.gz",
        ".fa.bgz",
        ".fasta.bgz",
        ".fna.bgz",
    ];

    let path = Path::new(filename);
    let name = path.file_name()?.to_str()?.to_lowercase();
    let ends_with_any =
        |suffixes: &[&str]| suffixes.iter().any(|suffix| name.ends_with(*suffix));

    // Double extensions must be handled before the single-extension match,
    // which would otherwise only see "gz" / "bgz".
    if ends_with_any(COMPRESSED_VCF_SUFFIXES) {
        return Some(FileFormat::Vcf);
    }
    if ends_with_any(COMPRESSED_FASTA_SUFFIXES) {
        return Some(FileFormat::Fasta);
    }

    let extension = path.extension()?.to_str()?.to_lowercase();
    match extension.as_str() {
        "sam" => Some(FileFormat::Sam),
        "bam" => Some(FileFormat::Bam),
        "cram" => Some(FileFormat::Cram),
        "dict" => Some(FileFormat::Dict),
        "vcf" => Some(FileFormat::Vcf),
        "fai" => Some(FileFormat::Fai),
        "fa" | "fasta" | "fna" => Some(FileFormat::Fasta),
        "tsv" | "csv" => Some(FileFormat::Tsv),
        "txt" => {
            // The explicit `.dict.txt` suffix is more specific than the
            // keyword heuristics, so it is checked first.
            if name.ends_with(".dict.txt") {
                Some(FileFormat::Dict)
            } else if name.contains("assembly") || name.contains("report") {
                Some(FileFormat::NcbiReport)
            } else {
                Some(FileFormat::Sam)
            }
        }
        _ => None,
    }
}
/// Returns `true` when `line` begins with `prefix` and the byte right after
/// the prefix is a tab or a space.
///
/// A line that is exactly `prefix`, or that continues with any other byte,
/// is not considered a record of that type.
fn is_sam_record(line: &str, prefix: &str) -> bool {
    match line.strip_prefix(prefix) {
        Some(rest) => matches!(rest.as_bytes().first(), Some(b'\t' | b' ')),
        None => false,
    }
}
/// Returns `true` when `lines` look like a `delimiter`-separated table whose
/// header row mentions "length", "size" or "sequence" (case-insensitive).
///
/// Requires at least two lines, more than two columns in the header, and the
/// same column count on each of the first five lines.
fn looks_like_sequence_table(lines: &[&str], delimiter: char) -> bool {
    let Some(header) = lines.first() else {
        return false;
    };
    if lines.len() < 2 {
        return false;
    }
    let columns = header.split(delimiter).count();
    if columns <= 2 {
        return false;
    }
    let consistent = lines
        .iter()
        .take(5)
        .all(|line| line.split(delimiter).count() == columns);
    if !consistent {
        return false;
    }
    // Lowercase the header once instead of once per keyword.
    let header = header.to_lowercase();
    ["length", "size", "sequence"]
        .iter()
        .any(|keyword| header.contains(*keyword))
}

/// Returns `true` when `line` has exactly five tab-separated fields and every
/// field after the first parses as an unsigned integer.
fn is_fai_record(line: &str) -> bool {
    let fields: Vec<&str> = line.split('\t').collect();
    fields.len() == 5 && fields[1..].iter().all(|field| field.parse::<u64>().is_ok())
}

/// Classifies text content using heuristics over its first 20 lines.
///
/// Checks run in this order; the first match wins:
/// 1. `@SQ` record present: `Dict` if an `@HD` record is also present,
///    otherwise `Sam`.
/// 2. A line starting with `##fileformat=VCF` or `##contig=`: `Vcf`.
/// 3. A line containing the NCBI assembly-report column names: `NcbiReport`.
/// 4. A tab- or comma-separated table with a sequence-related header: `Tsv`.
/// 5. Every non-empty, non-`#` line is a five-column FAI record: `Fai`.
/// 6. Any line mentioning "chr", "scaffold", "contig", "sequence" or
///    "length": `Sam` (last-resort guess).
///
/// Only the first 20 lines are inspected, so markers that appear later in the
/// input are not seen.
///
/// # Errors
///
/// * `FormatError::UnknownFormat` when the content is empty after trimming or
///   nothing above matches.
/// * `FormatError::UnsupportedBinary` when the content contains a control
///   character other than `\n`, `\r` or `\t`.
fn detect_format_from_content(content: &str) -> Result<FileFormat, FormatError> {
    let trimmed = content.trim();
    if trimmed.is_empty() {
        return Err(FormatError::UnknownFormat);
    }
    // The binary check deliberately scans the untrimmed input.
    if content
        .chars()
        .any(|c| c.is_control() && !matches!(c, '\n' | '\r' | '\t'))
    {
        return Err(FormatError::UnsupportedBinary);
    }

    let lines: Vec<&str> = trimmed.lines().take(20).collect();

    if lines.iter().any(|line| is_sam_record(line, "@SQ")) {
        let has_hd = lines.iter().any(|line| is_sam_record(line, "@HD"));
        return Ok(if has_hd {
            FileFormat::Dict
        } else {
            FileFormat::Sam
        });
    }

    // A `##contig=` line already starts with `##`, so no separate `##` test
    // is needed.
    if lines
        .iter()
        .any(|line| line.starts_with("##fileformat=VCF") || line.starts_with("##contig="))
    {
        return Ok(FileFormat::Vcf);
    }

    if lines.iter().any(|line| {
        line.contains("Sequence-Name")
            && line.contains("Sequence-Role")
            && line.contains("Assigned-Molecule")
    }) {
        return Ok(FileFormat::NcbiReport);
    }

    if looks_like_sequence_table(&lines, '\t') || looks_like_sequence_table(&lines, ',') {
        return Ok(FileFormat::Tsv);
    }

    let records: Vec<&str> = lines
        .iter()
        .copied()
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .collect();
    if !records.is_empty() && records.iter().all(|line| is_fai_record(line)) {
        return Ok(FileFormat::Fai);
    }

    let mentions_sequences = lines.iter().any(|line| {
        if line.contains("chr") || line.contains("scaffold") || line.contains("contig") {
            return true;
        }
        let lower = line.to_lowercase();
        lower.contains("sequence") || lower.contains("length")
    });
    if mentions_sequences {
        return Ok(FileFormat::Sam);
    }

    Err(FormatError::UnknownFormat)
}
/// Cheaply checks whether `content` is plausible for `format`.
///
/// Used to confirm a format guessed from a filename before trusting it.
/// The checks are substring-based and intentionally loose:
/// * `Sam`: contains `@SQ`, `SN:` or `LN:`.
/// * `Dict`: contains both `@HD` and `@SQ`.
/// * `Vcf`: contains `##contig=` or `##fileformat=VCF`.
/// * `NcbiReport`: contains `Sequence-Name` or `Sequence-Role`.
/// * `Tsv`: contains a tab or a comma (the `.csv` extension maps to `Tsv`
///   too) and mentions "length", "size" or "sequence" case-insensitively,
///   matching the keywords used by content-based detection.
/// * `Fai`: one of the first five lines has five tab-separated fields whose
///   last four parse as unsigned integers.
/// * `Bam`, `Cram`, `Fasta`: always `false`; they are not validated as text.
/// * `Auto`: always `true`.
// `format` stays a reference so existing call sites keep compiling.
#[allow(clippy::trivially_copy_pass_by_ref)]
fn validate_format_content(content: &str, format: &FileFormat) -> bool {
    match format {
        FileFormat::Sam => ["@SQ", "SN:", "LN:"]
            .iter()
            .any(|marker| content.contains(*marker)),
        FileFormat::Dict => content.contains("@HD") && content.contains("@SQ"),
        // Both markers already start with `##`, so no separate `##` test is needed.
        FileFormat::Vcf => content.contains("##contig=") || content.contains("##fileformat=VCF"),
        FileFormat::NcbiReport => {
            content.contains("Sequence-Name") || content.contains("Sequence-Role")
        }
        FileFormat::Tsv => {
            let delimited = content.contains('\t') || content.contains(',');
            delimited && {
                // Lowercase once instead of once per keyword.
                let lower = content.to_lowercase();
                ["length", "size", "sequence"]
                    .iter()
                    .any(|keyword| lower.contains(*keyword))
            }
        }
        FileFormat::Fai => content.lines().take(5).any(|line| {
            let fields: Vec<&str> = line.split('\t').collect();
            fields.len() == 5 && fields[1..].iter().all(|field| field.parse::<u64>().is_ok())
        }),
        FileFormat::Bam | FileFormat::Cram | FileFormat::Fasta => false,
        FileFormat::Auto => true,
    }
}
pub fn parse_with_format(content: &str, format: FileFormat) -> Result<QueryHeader, ParseError> {
match format {
FileFormat::Sam => {
crate::parsing::sam::parse_header_text(content).map_err(|e| ParseError::ParseFailed {
format: FileFormat::Sam,
message: e.to_string(),
})
}
FileFormat::Dict => {
crate::parsing::dict::parse_dict_text(content).map_err(|e| ParseError::ParseFailed {
format: FileFormat::Dict,
message: e.to_string(),
})
}
FileFormat::Vcf => crate::parsing::vcf::parse_vcf_header_text(content).map_err(|e| {
ParseError::ParseFailed {
format: FileFormat::Vcf,
message: e.to_string(),
}
}),
FileFormat::NcbiReport => {
match crate::parsing::ncbi_report::parse_ncbi_report_text(content) {
Ok(entries) => {
let contigs = entries.into_iter().map(|entry| entry.to_contig()).collect();
Ok(crate::core::header::QueryHeader::new(contigs))
}
Err(e) => Err(ParseError::ParseFailed {
format: FileFormat::NcbiReport,
message: e.to_string(),
}),
}
}
FileFormat::Tsv => {
match crate::parsing::tsv::parse_tsv_text(content, '\t') {
Ok(query) => Ok(query),
Err(_) => crate::parsing::tsv::parse_tsv_text(content, ',').map_err(|e| {
ParseError::ParseFailed {
format: FileFormat::Tsv,
message: format!("Failed to parse as TSV or CSV: {e}"),
}
}),
}
}
FileFormat::Fai => {
crate::parsing::fai::parse_fai_text(content).map_err(|e| ParseError::ParseFailed {
format: FileFormat::Fai,
message: e.to_string(),
})
}
FileFormat::Bam => Err(ParseError::ParseFailed {
format: FileFormat::Bam,
message: "BAM files must be parsed as binary, not text".to_string(),
}),
FileFormat::Cram => Err(ParseError::ParseFailed {
format: FileFormat::Cram,
message: "CRAM files must be parsed as binary, not text".to_string(),
}),
FileFormat::Fasta => Err(ParseError::ParseFailed {
format: FileFormat::Fasta,
message: "FASTA files must be parsed as binary, not text".to_string(),
}),
FileFormat::Auto => {
let detected_format =
detect_format_from_content(content).map_err(|e| ParseError::ParseFailed {
format: FileFormat::Auto,
message: format!("Auto-detection failed: {e}"),
})?;
parse_with_format(content, detected_format)
}
}
}
/// Parses an in-memory file as `format` and returns the resulting header.
///
/// * `Bam` and `Cram` are read directly from the byte slice.
/// * `Fasta` is written to a named temporary file first, because the FASTA
///   parser is called with a path. The temporary file gets a `.fa.gz` suffix
///   when the bytes start with the gzip magic number and `.fa` otherwise.
/// * Any other format is rejected.
///
/// # Errors
///
/// Returns `ParseError::Io` when the temporary file cannot be created or
/// written, and `ParseError::ParseFailed` when parsing fails or `format` is
/// not one of the formats handled here.
pub fn parse_binary_file(
    file_content: &[u8],
    format: FileFormat,
) -> Result<QueryHeader, ParseError> {
    let failed = |message: String| ParseError::ParseFailed { format, message };

    match format {
        FileFormat::Bam => {
            crate::parsing::sam::parse_bam_from_reader(std::io::Cursor::new(file_content))
                .map_err(|e| failed(format!("BAM file parsing failed: {e}")))
        }
        FileFormat::Cram => {
            crate::parsing::sam::parse_cram_from_reader(std::io::Cursor::new(file_content))
                .map_err(|e| failed(format!("CRAM file parsing failed: {e}")))
        }
        FileFormat::Fasta => {
            use std::io::Write;
            use tempfile::NamedTempFile;

            // 0x1f 0x8b is the gzip magic number.
            let suffix = if file_content.starts_with(&[0x1f, 0x8b]) {
                ".fa.gz"
            } else {
                ".fa"
            };
            let mut spooled = NamedTempFile::with_suffix(suffix)?;
            spooled.write_all(file_content)?;

            // Parse while the temporary file is still alive, then map the error.
            let parsed = crate::parsing::fasta::parse_fasta_file(spooled.path());
            parsed.map_err(|e| failed(format!("FASTA file parsing failed: {e}")))
        }
        _ => Err(failed("Format is not a binary file format".to_string())),
    }
}
/// Parses the file at `path` as `format` and returns the resulting header.
///
/// `Bam` and `Cram` go to the SAM-family file parser, `Fasta` goes to the
/// FASTA parser, and every other format is rejected.
///
/// # Errors
///
/// Returns `ParseError::ParseFailed` when the underlying parser fails or when
/// `format` is not one of the formats handled here.
pub fn parse_binary_file_from_path(
    path: &std::path::Path,
    format: FileFormat,
) -> Result<QueryHeader, ParseError> {
    // Each arm produces a plain message; it is wrapped once at the end.
    let outcome = match format {
        FileFormat::Bam | FileFormat::Cram => crate::parsing::sam::parse_file(path)
            .map_err(|e| format!("Binary file parsing failed: {e}")),
        FileFormat::Fasta => crate::parsing::fasta::parse_fasta_file(path)
            .map_err(|e| format!("FASTA file parsing failed: {e}")),
        _ => Err("Format is not a binary file format".to_string()),
    };
    outcome.map_err(|message| ParseError::ParseFailed { format, message })
}
// Unit tests for format detection, content validation, and the binary
// parsing entry points.
#[cfg(test)]
mod tests {
use super::*;
// Filename-only detection: single extensions, the `.vcf.gz` / `.fa.gz` /
// `.fasta.gz` double extensions, the `.txt` keyword heuristic, and an
// unknown extension.
#[test]
fn test_filename_detection() {
assert_eq!(
detect_format_from_filename("test.sam"),
Some(FileFormat::Sam)
);
assert_eq!(
detect_format_from_filename("test.bam"),
Some(FileFormat::Bam)
);
assert_eq!(
detect_format_from_filename("test.dict"),
Some(FileFormat::Dict)
);
assert_eq!(
detect_format_from_filename("test.vcf"),
Some(FileFormat::Vcf)
);
assert_eq!(
detect_format_from_filename("test.vcf.gz"),
Some(FileFormat::Vcf)
);
assert_eq!(
detect_format_from_filename("assembly_report.txt"),
Some(FileFormat::NcbiReport)
);
assert_eq!(
detect_format_from_filename("reference.fai"),
Some(FileFormat::Fai)
);
assert_eq!(
detect_format_from_filename("reference.fa"),
Some(FileFormat::Fasta)
);
assert_eq!(
detect_format_from_filename("reference.fasta"),
Some(FileFormat::Fasta)
);
assert_eq!(
detect_format_from_filename("reference.fa.gz"),
Some(FileFormat::Fasta)
);
assert_eq!(
detect_format_from_filename("reference.fasta.gz"),
Some(FileFormat::Fasta)
);
assert_eq!(detect_format_from_filename("unknown.xyz"), None);
}
// A lone tab-separated `@SQ` line (no `@HD`) is detected as SAM.
#[test]
fn test_sam_header_detection() {
let content = "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n";
assert_eq!(detect_format_from_content(content), Ok(FileFormat::Sam));
}
// `@HD` together with `@SQ` is detected as a sequence dictionary.
#[test]
fn test_dict_detection() {
let content = "@HD\tVN:1.0\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\tM5:abc123\n";
assert_eq!(detect_format_from_content(content), Ok(FileFormat::Dict));
}
// VCF header lines are detected as VCF.
#[test]
fn test_vcf_detection() {
let content = "##fileformat=VCFv4.2\n##contig=<ID=chr1,length=248956422>\n";
assert_eq!(detect_format_from_content(content), Ok(FileFormat::Vcf));
}
// A line carrying the NCBI assembly-report column names is detected as an
// NCBI report.
#[test]
fn test_ncbi_report_detection() {
let content =
"# Sequence-Name\tSequence-Role\tAssigned-Molecule\tAssigned-Molecule-Location/Type\n";
assert_eq!(
detect_format_from_content(content),
Ok(FileFormat::NcbiReport)
);
}
// Lines with five tab-separated columns, the last four numeric, are detected
// as a FASTA index.
#[test]
fn test_fai_detection() {
let content = "chr1\t248956422\t112\t70\t71\nchr2\t242193529\t253404903\t70\t71\n";
assert_eq!(detect_format_from_content(content), Ok(FileFormat::Fai));
}
// FAI validation accepts a five-column record and rejects a three-column one.
#[test]
fn test_fai_validation() {
assert!(validate_format_content(
"chr1\t248956422\t112\t70\t71",
&FileFormat::Fai
));
assert!(!validate_format_content(
"chr1\t248956422\t112",
&FileFormat::Fai
));
}
// Content validation for SAM and VCF: matching content passes, unrelated
// content fails.
#[test]
fn test_format_validation() {
assert!(validate_format_content(
"@SQ\tSN:chr1\tLN:123",
&FileFormat::Sam
));
assert!(!validate_format_content("random text", &FileFormat::Sam));
assert!(validate_format_content(
"##contig=<ID=chr1>",
&FileFormat::Vcf
));
assert!(!validate_format_content("@SQ\tSN:chr1", &FileFormat::Vcf));
}
// Space-separated (rather than tab-separated) `@SQ` records are still
// detected as SAM.
#[test]
fn test_sam_header_detection_with_spaces() {
let content = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";
assert_eq!(detect_format_from_content(content), Ok(FileFormat::Sam));
}
// Space-separated `@HD` + `@SQ` records are still detected as a dictionary.
#[test]
fn test_dict_detection_with_spaces() {
let content = "@HD VN:1.0 SO:coordinate\n@SQ SN:chr1 LN:248956422\n";
assert_eq!(detect_format_from_content(content), Ok(FileFormat::Dict));
}
// SAM validation also accepts space-separated records.
#[test]
fn test_sam_validation_with_spaces() {
assert!(validate_format_content(
"@SQ SN:chr1 LN:123",
&FileFormat::Sam
));
}
// Filename hint combined with content: the `.dict` hint fails validation
// because the content has no `@HD`, so detection falls back to the content
// heuristics and reports SAM; with no filename the content alone gives SAM.
#[test]
fn test_combined_detection() {
let content = "@SQ\tSN:chr1\tLN:248956422\n";
assert_eq!(
detect_format(content, Some("test.sam")),
Ok(FileFormat::Sam)
);
assert_eq!(
detect_format(content, Some("test.dict")),
Ok(FileFormat::Sam)
); assert_eq!(detect_format(content, None), Ok(FileFormat::Sam));
}
// Builds a header with one reference sequence, writes only that header with
// the BAM writer into a byte buffer, and checks that `parse_binary_file`
// recovers the contig name and length.
#[test]
fn test_parse_binary_file_bam_from_bytes() {
use noodles::bam;
use noodles::sam;
use noodles::sam::header::record::value::map::{Map, ReferenceSequence};
use std::num::NonZeroUsize;
let header = sam::Header::builder()
.add_reference_sequence(
"chr1",
Map::<ReferenceSequence>::new(NonZeroUsize::new(248_956_422).unwrap()),
)
.build();
let mut bam_bytes = Vec::new();
{
// Scoped so the writer's mutable borrow of `bam_bytes` ends before the
// buffer is read.
let mut writer = bam::io::Writer::new(&mut bam_bytes);
writer.write_header(&header).unwrap();
}
let query = parse_binary_file(&bam_bytes, FileFormat::Bam).unwrap();
assert_eq!(query.contigs.len(), 1);
assert_eq!(query.contigs[0].name, "chr1");
assert_eq!(query.contigs[0].length, 248_956_422);
}
// Test helper: reads `header_text` with the SAM reader and returns the bytes
// produced by writing only that header with the BAM writer.
fn build_bam_bytes(header_text: &str) -> Vec<u8> {
use noodles::bam;
use noodles::sam;
let mut reader = sam::io::Reader::new(header_text.as_bytes());
let header = reader.read_header().unwrap();
let mut buf = Vec::new();
{
// Scoped so the writer's mutable borrow of `buf` ends before it is returned.
let mut writer = bam::io::Writer::new(&mut buf);
writer.write_header(&header).unwrap();
}
buf
}
// Writes header-only BAM bytes to a `.bam` temporary file and checks that
// path-based parsing returns both contigs in order.
#[test]
fn test_parse_binary_file_from_path_bam() {
use std::io::Write;
use tempfile::NamedTempFile;
let bam_bytes = build_bam_bytes(
"@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n@SQ\tSN:chr2\tLN:242193529\n",
);
let mut temp = NamedTempFile::with_suffix(".bam").unwrap();
temp.write_all(&bam_bytes).unwrap();
let result = parse_binary_file_from_path(temp.path(), FileFormat::Bam);
assert!(result.is_ok());
let query = result.unwrap();
assert_eq!(query.contigs.len(), 2);
assert_eq!(query.contigs[0].name, "chr1");
assert_eq!(query.contigs[0].length, 248_956_422);
assert_eq!(query.contigs[1].name, "chr2");
assert_eq!(query.contigs[1].length, 242_193_529);
}
// Appends 1024 zero bytes after valid header bytes and expects the header to
// still parse. Despite the test name, the input has trailing extra bytes
// rather than missing data.
#[test]
fn test_parse_binary_file_from_path_truncated_bam() {
use std::io::Write;
use tempfile::NamedTempFile;
let mut bam_bytes = build_bam_bytes("@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n");
bam_bytes.extend_from_slice(&[0u8; 1024]);
let mut temp = NamedTempFile::with_suffix(".bam").unwrap();
temp.write_all(&bam_bytes).unwrap();
let result = parse_binary_file_from_path(temp.path(), FileFormat::Bam);
assert!(result.is_ok());
let query = result.unwrap();
assert_eq!(query.contigs.len(), 1);
assert_eq!(query.contigs[0].name, "chr1");
}
// A text format (`Sam`) passed to the path-based binary entry point is
// rejected with an error.
#[test]
fn test_parse_binary_file_from_path_unsupported_format() {
use std::io::Write;
use tempfile::NamedTempFile;
let mut temp = NamedTempFile::with_suffix(".txt").unwrap();
temp.write_all(b"not a binary file").unwrap();
let result = parse_binary_file_from_path(temp.path(), FileFormat::Sam);
assert!(result.is_err());
}
}