use std::collections::HashMap;
/// Column layout of one genomic file format.
///
/// `columns` keeps the definitions in schema order, while `column_map` caches
/// the position of each column name inside `columns` for name-based lookup.
#[derive(Clone, Debug)]
pub struct GenomicSchema {
    /// File format this schema describes.
    pub format: FileFormat,
    /// Column definitions, in schema order.
    pub columns: Vec<ColumnDef>,
    /// Column name -> index into `columns`.
    // NOTE(review): `columns` is public but this index is private, so edits
    // made directly to `columns` are not reflected here.
    column_map: HashMap<String, usize>,
}
/// Definition of a single column inside a schema.
#[derive(Clone, Debug, PartialEq)]
pub struct ColumnDef {
    /// Column name; schemas look columns up by this value.
    pub name: String,
    /// Storage type of the column's values.
    pub dtype: DataType,
    /// Genomic role of the column, or `None` when no `GenomicType` is attached.
    pub genomic_type: Option<GenomicType>,
    /// Whether the column is marked as nullable.
    pub nullable: bool,
    /// Human-readable description of the column.
    pub description: String,
}
/// Genomic file formats that a schema can be produced for.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum FileFormat {
    /// VCF (variant records).
    Vcf,
    /// BAM (alignment records).
    Bam,
    /// SAM (alignment records).
    Sam,
    /// BED (interval records).
    Bed,
    /// FASTQ (reads with base qualities).
    Fastq,
    /// FASTA (named sequences).
    Fasta,
    /// GFF (feature records).
    Gff,
}
/// Storage type of a column's values.
#[derive(Clone, Debug, PartialEq)]
pub enum DataType {
    /// Boolean value.
    Boolean,
    /// 32-bit integer.
    Int32,
    /// 64-bit integer.
    Int64,
    /// 32-bit floating-point number.
    Float32,
    /// 64-bit floating-point number.
    Float64,
    /// Text value.
    String,
    /// List whose element type is the boxed `DataType`.
    List(Box<DataType>),
    /// Nested record described by its own column definitions.
    Struct(Vec<ColumnDef>),
}
/// Genomic role a column plays, independent of its storage type.
///
/// Derives `Hash` (in addition to `Eq`) so the role can be used as a
/// `HashMap` key or `HashSet` element, matching `FileFormat`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GenomicType {
    /// Chromosome or reference sequence name.
    Chromosome,
    /// Genomic coordinate (a position, start, or end).
    Position,
    /// Quality score of a record.
    Quality,
    /// Reference allele.
    ReferenceAllele,
    /// Alternate allele(s).
    AlternateAllele,
    /// Filter status.
    Filter,
    /// Strand.
    Strand,
    /// Mapping quality.
    MappingQuality,
    /// CIGAR string.
    Cigar,
    /// Sequence.
    Sequence,
    /// Base quality scores.
    BaseQuality,
}
impl GenomicSchema {
pub fn new(format: FileFormat, columns: Vec<ColumnDef>) -> Self {
let mut column_map = HashMap::new();
for (idx, col) in columns.iter().enumerate() {
column_map.insert(col.name.clone(), idx);
}
Self {
format,
columns,
column_map,
}
}
pub fn column(&self, name: &str) -> Option<&ColumnDef> {
self.column_map.get(name).map(|&idx| &self.columns[idx])
}
pub fn has_column(&self, name: &str) -> bool {
self.column_map.contains_key(name)
}
pub fn column_names(&self) -> Vec<&str> {
self.columns.iter().map(|c| c.name.as_str()).collect()
}
}
impl GenomicSchema {
pub fn vcf() -> Self {
Self::new(
FileFormat::Vcf,
vec![
ColumnDef {
name: "chrom".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Chromosome),
nullable: false,
description: "Chromosome name".to_string(),
},
ColumnDef {
name: "pos".to_string(),
dtype: DataType::Int64,
genomic_type: Some(GenomicType::Position),
nullable: false,
description: "1-based position".to_string(),
},
ColumnDef {
name: "id".to_string(),
dtype: DataType::String,
genomic_type: None,
nullable: true,
description: "Variant ID".to_string(),
},
ColumnDef {
name: "ref".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::ReferenceAllele),
nullable: false,
description: "Reference allele".to_string(),
},
ColumnDef {
name: "alt".to_string(),
dtype: DataType::List(Box::new(DataType::String)),
genomic_type: Some(GenomicType::AlternateAllele),
nullable: false,
description: "Alternate allele(s)".to_string(),
},
ColumnDef {
name: "qual".to_string(),
dtype: DataType::Float64,
genomic_type: Some(GenomicType::Quality),
nullable: true,
description: "Quality score (Phred-scaled)".to_string(),
},
ColumnDef {
name: "filter".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Filter),
nullable: false,
description: "Filter status (PASS, FAIL, etc.)".to_string(),
},
],
)
}
pub fn bam() -> Self {
Self::new(
FileFormat::Bam,
vec![
ColumnDef {
name: "qname".to_string(),
dtype: DataType::String,
genomic_type: None,
nullable: false,
description: "Query name".to_string(),
},
ColumnDef {
name: "flag".to_string(),
dtype: DataType::Int32,
genomic_type: None,
nullable: false,
description: "Bitwise flags".to_string(),
},
ColumnDef {
name: "rname".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Chromosome),
nullable: true,
description: "Reference sequence name".to_string(),
},
ColumnDef {
name: "pos".to_string(),
dtype: DataType::Int64,
genomic_type: Some(GenomicType::Position),
nullable: false,
description: "1-based leftmost position".to_string(),
},
ColumnDef {
name: "mapq".to_string(),
dtype: DataType::Int32,
genomic_type: Some(GenomicType::MappingQuality),
nullable: false,
description: "Mapping quality".to_string(),
},
ColumnDef {
name: "cigar".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Cigar),
nullable: true,
description: "CIGAR string".to_string(),
},
ColumnDef {
name: "seq".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Sequence),
nullable: false,
description: "Read sequence".to_string(),
},
ColumnDef {
name: "qual".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::BaseQuality),
nullable: true,
description: "Base quality scores".to_string(),
},
],
)
}
pub fn bed() -> Self {
Self::new(
FileFormat::Bed,
vec![
ColumnDef {
name: "chrom".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Chromosome),
nullable: false,
description: "Chromosome name".to_string(),
},
ColumnDef {
name: "start".to_string(),
dtype: DataType::Int64,
genomic_type: Some(GenomicType::Position),
nullable: false,
description: "0-based start position".to_string(),
},
ColumnDef {
name: "end".to_string(),
dtype: DataType::Int64,
genomic_type: Some(GenomicType::Position),
nullable: false,
description: "End position (exclusive)".to_string(),
},
ColumnDef {
name: "name".to_string(),
dtype: DataType::String,
genomic_type: None,
nullable: true,
description: "Feature name".to_string(),
},
ColumnDef {
name: "score".to_string(),
dtype: DataType::Float64,
genomic_type: None,
nullable: true,
description: "Score (0-1000)".to_string(),
},
ColumnDef {
name: "strand".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Strand),
nullable: true,
description: "Strand (+/-)".to_string(),
},
],
)
}
pub fn fastq() -> Self {
Self::new(
FileFormat::Fastq,
vec![
ColumnDef {
name: "id".to_string(),
dtype: DataType::String,
genomic_type: None,
nullable: false,
description: "Read identifier".to_string(),
},
ColumnDef {
name: "sequence".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Sequence),
nullable: false,
description: "Read sequence".to_string(),
},
ColumnDef {
name: "quality".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::BaseQuality),
nullable: false,
description: "Base quality scores (Phred+33)".to_string(),
},
],
)
}
}
impl FileFormat {
pub fn schema(&self) -> GenomicSchema {
match self {
FileFormat::Vcf => GenomicSchema::vcf(),
FileFormat::Bam | FileFormat::Sam => GenomicSchema::bam(),
FileFormat::Bed => GenomicSchema::bed(),
FileFormat::Fastq => GenomicSchema::fastq(),
FileFormat::Fasta => {
GenomicSchema::new(
FileFormat::Fasta,
vec![
ColumnDef {
name: "id".to_string(),
dtype: DataType::String,
genomic_type: None,
nullable: false,
description: "Sequence identifier".to_string(),
},
ColumnDef {
name: "sequence".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Sequence),
nullable: false,
description: "Sequence".to_string(),
},
],
)
}
FileFormat::Gff => {
GenomicSchema::new(
FileFormat::Gff,
vec![
ColumnDef {
name: "seqid".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Chromosome),
nullable: false,
description: "Sequence ID".to_string(),
},
ColumnDef {
name: "source".to_string(),
dtype: DataType::String,
genomic_type: None,
nullable: true,
description: "Source".to_string(),
},
ColumnDef {
name: "type".to_string(),
dtype: DataType::String,
genomic_type: None,
nullable: false,
description: "Feature type".to_string(),
},
ColumnDef {
name: "start".to_string(),
dtype: DataType::Int64,
genomic_type: Some(GenomicType::Position),
nullable: false,
description: "1-based start position".to_string(),
},
ColumnDef {
name: "end".to_string(),
dtype: DataType::Int64,
genomic_type: Some(GenomicType::Position),
nullable: false,
description: "End position (inclusive)".to_string(),
},
ColumnDef {
name: "score".to_string(),
dtype: DataType::Float64,
genomic_type: None,
nullable: true,
description: "Score".to_string(),
},
ColumnDef {
name: "strand".to_string(),
dtype: DataType::String,
genomic_type: Some(GenomicType::Strand),
nullable: true,
description: "Strand (+/-/.)".to_string(),
},
],
)
}
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// VCF preset: format tag, column presence, and typing of `qual`.
    #[test]
    fn test_vcf_schema() {
        let vcf = GenomicSchema::vcf();
        assert_eq!(vcf.format, FileFormat::Vcf);

        for expected in ["chrom", "qual"].iter() {
            assert!(vcf.has_column(expected));
        }
        assert!(!vcf.has_column("invalid"));

        let qual = vcf.column("qual").unwrap();
        assert_eq!(qual.dtype, DataType::Float64);
        assert_eq!(qual.genomic_type, Some(GenomicType::Quality));
    }

    /// BAM preset: format tag and the role attached to `mapq`.
    #[test]
    fn test_bam_schema() {
        let bam = GenomicSchema::bam();
        assert_eq!(bam.format, FileFormat::Bam);
        assert!(bam.has_column("mapq"));

        let mapq = bam.column("mapq").unwrap();
        assert_eq!(mapq.genomic_type, Some(GenomicType::MappingQuality));
    }

    /// BED preset: six columns, including the interval triple.
    #[test]
    fn test_bed_schema() {
        let bed = GenomicSchema::bed();
        assert_eq!(bed.column_names().len(), 6);

        for expected in ["chrom", "start", "end"].iter() {
            assert!(bed.has_column(expected));
        }
    }

    /// FASTQ preset: three columns and the role attached to `sequence`.
    #[test]
    fn test_fastq_schema() {
        let fastq = GenomicSchema::fastq();
        assert_eq!(fastq.columns.len(), 3);

        let sequence = fastq.column("sequence").unwrap();
        assert_eq!(sequence.genomic_type, Some(GenomicType::Sequence));
    }

    /// `FileFormat::schema` returns a schema tagged with the requested format.
    #[test]
    fn test_format_schema_method() {
        let from_vcf = FileFormat::Vcf.schema();
        assert_eq!(from_vcf.format, FileFormat::Vcf);

        let from_bam = FileFormat::Bam.schema();
        assert_eq!(from_bam.format, FileFormat::Bam);
    }
}