Skip to main content

genomicframe_core/
schema.rs

//! Schema system for genomic data formats
//!
//! This module provides metadata about the structure of different genomic formats,
//! enabling introspection, validation, and type-safe query building.
5
6use std::collections::HashMap;
7
// ============================================================================
// Core Schema Types
// ============================================================================
11
12/// Schema for a genomic data format
13///
14/// Describes the columns/fields available in a format and their types.
15#[derive(Debug, Clone)]
16pub struct GenomicSchema {
17    /// The file format this schema describes
18    pub format: FileFormat,
19
20    /// Columns in this format
21    pub columns: Vec<ColumnDef>,
22
23    /// Quick lookup by column name
24    column_map: HashMap<String, usize>,
25}
26
27/// Definition of a single column/field
28#[derive(Debug, Clone, PartialEq)]
29pub struct ColumnDef {
30    /// Column name (e.g., "qual", "chrom", "pos")
31    pub name: String,
32
33    /// Data type of this column
34    pub dtype: DataType,
35
36    /// Optional genomic-specific type information
37    pub genomic_type: Option<GenomicType>,
38
39    /// Whether this column is nullable
40    pub nullable: bool,
41
42    /// Human-readable description
43    pub description: String,
44}
45
/// File formats a schema can describe.
///
/// The enum is a plain tag: it is `Copy`, comparable and hashable, so it can
/// be passed by value and used as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileFormat {
    /// VCF.
    Vcf,
    /// BAM.
    Bam,
    /// SAM.
    Sam,
    /// BED.
    Bed,
    /// FASTQ.
    Fastq,
    /// FASTA.
    Fasta,
    /// GFF.
    Gff,
}
57
58/// Generic data types
59#[derive(Debug, Clone, PartialEq)]
60pub enum DataType {
61    /// Boolean (true/false)
62    Boolean,
63
64    /// 32-bit integer
65    Int32,
66
67    /// 64-bit integer
68    Int64,
69
70    /// 32-bit float
71    Float32,
72
73    /// 64-bit float
74    Float64,
75
76    /// UTF-8 string
77    String,
78
79    /// List of values
80    List(Box<DataType>),
81
82    /// Struct with named fields
83    Struct(Vec<ColumnDef>),
84}
85
/// Genomic-specific type information
///
/// Attaches semantic meaning to a column so genomic operations can find the
/// column they need (e.g. "the chromosome column") regardless of its name.
///
/// Like [`FileFormat`], this is a fieldless tag that is `Copy`, comparable and
/// hashable, so it can be used as a `HashMap`/`HashSet` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GenomicType {
    /// Chromosome/contig name
    Chromosome,

    /// Genomic coordinate.
    ///
    /// Whether it is 0- or 1-based depends on the format: the BED `start`
    /// column is 0-based while the VCF/BAM `pos` columns are 1-based. The
    /// column's `description` states which applies.
    Position,

    /// Quality score (Phred-scaled)
    Quality,

    /// Reference allele
    ReferenceAllele,

    /// Alternate allele(s)
    AlternateAllele,

    /// Filter status (PASS, FAIL, etc.)
    Filter,

    /// Strand (+/-)
    Strand,

    /// Mapping quality (BAM)
    MappingQuality,

    /// CIGAR string (BAM)
    Cigar,

    /// DNA sequence
    Sequence,

    /// Base quality scores
    BaseQuality,
}
124
// ============================================================================
// Schema Implementation
// ============================================================================
128
129impl GenomicSchema {
130    /// Create a new schema
131    pub fn new(format: FileFormat, columns: Vec<ColumnDef>) -> Self {
132        let mut column_map = HashMap::new();
133        for (idx, col) in columns.iter().enumerate() {
134            column_map.insert(col.name.clone(), idx);
135        }
136
137        Self {
138            format,
139            columns,
140            column_map,
141        }
142    }
143
144    /// Get column by name
145    pub fn column(&self, name: &str) -> Option<&ColumnDef> {
146        self.column_map.get(name).map(|&idx| &self.columns[idx])
147    }
148
149    /// Check if column exists
150    pub fn has_column(&self, name: &str) -> bool {
151        self.column_map.contains_key(name)
152    }
153
154    /// Get all column names
155    pub fn column_names(&self) -> Vec<&str> {
156        self.columns.iter().map(|c| c.name.as_str()).collect()
157    }
158}
159
// ============================================================================
// Format-Specific Schemas
// ============================================================================
163
164impl GenomicSchema {
165    /// VCF format schema
166    pub fn vcf() -> Self {
167        Self::new(
168            FileFormat::Vcf,
169            vec![
170                ColumnDef {
171                    name: "chrom".to_string(),
172                    dtype: DataType::String,
173                    genomic_type: Some(GenomicType::Chromosome),
174                    nullable: false,
175                    description: "Chromosome name".to_string(),
176                },
177                ColumnDef {
178                    name: "pos".to_string(),
179                    dtype: DataType::Int64,
180                    genomic_type: Some(GenomicType::Position),
181                    nullable: false,
182                    description: "1-based position".to_string(),
183                },
184                ColumnDef {
185                    name: "id".to_string(),
186                    dtype: DataType::String,
187                    genomic_type: None,
188                    nullable: true,
189                    description: "Variant ID".to_string(),
190                },
191                ColumnDef {
192                    name: "ref".to_string(),
193                    dtype: DataType::String,
194                    genomic_type: Some(GenomicType::ReferenceAllele),
195                    nullable: false,
196                    description: "Reference allele".to_string(),
197                },
198                ColumnDef {
199                    name: "alt".to_string(),
200                    dtype: DataType::List(Box::new(DataType::String)),
201                    genomic_type: Some(GenomicType::AlternateAllele),
202                    nullable: false,
203                    description: "Alternate allele(s)".to_string(),
204                },
205                ColumnDef {
206                    name: "qual".to_string(),
207                    dtype: DataType::Float64,
208                    genomic_type: Some(GenomicType::Quality),
209                    nullable: true,
210                    description: "Quality score (Phred-scaled)".to_string(),
211                },
212                ColumnDef {
213                    name: "filter".to_string(),
214                    dtype: DataType::String,
215                    genomic_type: Some(GenomicType::Filter),
216                    nullable: false,
217                    description: "Filter status (PASS, FAIL, etc.)".to_string(),
218                },
219            ],
220        )
221    }
222
223    /// BAM format schema
224    pub fn bam() -> Self {
225        Self::new(
226            FileFormat::Bam,
227            vec![
228                ColumnDef {
229                    name: "qname".to_string(),
230                    dtype: DataType::String,
231                    genomic_type: None,
232                    nullable: false,
233                    description: "Query name".to_string(),
234                },
235                ColumnDef {
236                    name: "flag".to_string(),
237                    dtype: DataType::Int32,
238                    genomic_type: None,
239                    nullable: false,
240                    description: "Bitwise flags".to_string(),
241                },
242                ColumnDef {
243                    name: "rname".to_string(),
244                    dtype: DataType::String,
245                    genomic_type: Some(GenomicType::Chromosome),
246                    nullable: true,
247                    description: "Reference sequence name".to_string(),
248                },
249                ColumnDef {
250                    name: "pos".to_string(),
251                    dtype: DataType::Int64,
252                    genomic_type: Some(GenomicType::Position),
253                    nullable: false,
254                    description: "1-based leftmost position".to_string(),
255                },
256                ColumnDef {
257                    name: "mapq".to_string(),
258                    dtype: DataType::Int32,
259                    genomic_type: Some(GenomicType::MappingQuality),
260                    nullable: false,
261                    description: "Mapping quality".to_string(),
262                },
263                ColumnDef {
264                    name: "cigar".to_string(),
265                    dtype: DataType::String,
266                    genomic_type: Some(GenomicType::Cigar),
267                    nullable: true,
268                    description: "CIGAR string".to_string(),
269                },
270                ColumnDef {
271                    name: "seq".to_string(),
272                    dtype: DataType::String,
273                    genomic_type: Some(GenomicType::Sequence),
274                    nullable: false,
275                    description: "Read sequence".to_string(),
276                },
277                ColumnDef {
278                    name: "qual".to_string(),
279                    dtype: DataType::String,
280                    genomic_type: Some(GenomicType::BaseQuality),
281                    nullable: true,
282                    description: "Base quality scores".to_string(),
283                },
284            ],
285        )
286    }
287
288    /// BED format schema
289    pub fn bed() -> Self {
290        Self::new(
291            FileFormat::Bed,
292            vec![
293                ColumnDef {
294                    name: "chrom".to_string(),
295                    dtype: DataType::String,
296                    genomic_type: Some(GenomicType::Chromosome),
297                    nullable: false,
298                    description: "Chromosome name".to_string(),
299                },
300                ColumnDef {
301                    name: "start".to_string(),
302                    dtype: DataType::Int64,
303                    genomic_type: Some(GenomicType::Position),
304                    nullable: false,
305                    description: "0-based start position".to_string(),
306                },
307                ColumnDef {
308                    name: "end".to_string(),
309                    dtype: DataType::Int64,
310                    genomic_type: Some(GenomicType::Position),
311                    nullable: false,
312                    description: "End position (exclusive)".to_string(),
313                },
314                ColumnDef {
315                    name: "name".to_string(),
316                    dtype: DataType::String,
317                    genomic_type: None,
318                    nullable: true,
319                    description: "Feature name".to_string(),
320                },
321                ColumnDef {
322                    name: "score".to_string(),
323                    dtype: DataType::Float64,
324                    genomic_type: None,
325                    nullable: true,
326                    description: "Score (0-1000)".to_string(),
327                },
328                ColumnDef {
329                    name: "strand".to_string(),
330                    dtype: DataType::String,
331                    genomic_type: Some(GenomicType::Strand),
332                    nullable: true,
333                    description: "Strand (+/-)".to_string(),
334                },
335            ],
336        )
337    }
338
339    /// FASTQ format schema
340    pub fn fastq() -> Self {
341        Self::new(
342            FileFormat::Fastq,
343            vec![
344                ColumnDef {
345                    name: "id".to_string(),
346                    dtype: DataType::String,
347                    genomic_type: None,
348                    nullable: false,
349                    description: "Read identifier".to_string(),
350                },
351                ColumnDef {
352                    name: "sequence".to_string(),
353                    dtype: DataType::String,
354                    genomic_type: Some(GenomicType::Sequence),
355                    nullable: false,
356                    description: "Read sequence".to_string(),
357                },
358                ColumnDef {
359                    name: "quality".to_string(),
360                    dtype: DataType::String,
361                    genomic_type: Some(GenomicType::BaseQuality),
362                    nullable: false,
363                    description: "Base quality scores (Phred+33)".to_string(),
364                },
365            ],
366        )
367    }
368}
369
// ============================================================================
// Helper Functions
// ============================================================================
373
374impl FileFormat {
375    /// Get the default schema for this format
376    pub fn schema(&self) -> GenomicSchema {
377        match self {
378            FileFormat::Vcf => GenomicSchema::vcf(),
379            FileFormat::Bam | FileFormat::Sam => GenomicSchema::bam(),
380            FileFormat::Bed => GenomicSchema::bed(),
381            FileFormat::Fastq => GenomicSchema::fastq(),
382            FileFormat::Fasta => {
383                // Minimal FASTA schema
384                GenomicSchema::new(
385                    FileFormat::Fasta,
386                    vec![
387                        ColumnDef {
388                            name: "id".to_string(),
389                            dtype: DataType::String,
390                            genomic_type: None,
391                            nullable: false,
392                            description: "Sequence identifier".to_string(),
393                        },
394                        ColumnDef {
395                            name: "sequence".to_string(),
396                            dtype: DataType::String,
397                            genomic_type: Some(GenomicType::Sequence),
398                            nullable: false,
399                            description: "Sequence".to_string(),
400                        },
401                    ],
402                )
403            }
404            FileFormat::Gff => {
405                // GFF schema
406                GenomicSchema::new(
407                    FileFormat::Gff,
408                    vec![
409                        ColumnDef {
410                            name: "seqid".to_string(),
411                            dtype: DataType::String,
412                            genomic_type: Some(GenomicType::Chromosome),
413                            nullable: false,
414                            description: "Sequence ID".to_string(),
415                        },
416                        ColumnDef {
417                            name: "source".to_string(),
418                            dtype: DataType::String,
419                            genomic_type: None,
420                            nullable: true,
421                            description: "Source".to_string(),
422                        },
423                        ColumnDef {
424                            name: "type".to_string(),
425                            dtype: DataType::String,
426                            genomic_type: None,
427                            nullable: false,
428                            description: "Feature type".to_string(),
429                        },
430                        ColumnDef {
431                            name: "start".to_string(),
432                            dtype: DataType::Int64,
433                            genomic_type: Some(GenomicType::Position),
434                            nullable: false,
435                            description: "1-based start position".to_string(),
436                        },
437                        ColumnDef {
438                            name: "end".to_string(),
439                            dtype: DataType::Int64,
440                            genomic_type: Some(GenomicType::Position),
441                            nullable: false,
442                            description: "End position (inclusive)".to_string(),
443                        },
444                        ColumnDef {
445                            name: "score".to_string(),
446                            dtype: DataType::Float64,
447                            genomic_type: None,
448                            nullable: true,
449                            description: "Score".to_string(),
450                        },
451                        ColumnDef {
452                            name: "strand".to_string(),
453                            dtype: DataType::String,
454                            genomic_type: Some(GenomicType::Strand),
455                            nullable: true,
456                            description: "Strand (+/-/.)".to_string(),
457                        },
458                    ],
459                )
460            }
461        }
462    }
463}
464
#[cfg(test)]
mod tests {
    use super::*;

    /// VCF: format tag, column presence, and the typing of `qual`.
    #[test]
    fn test_vcf_schema() {
        let vcf = GenomicSchema::vcf();
        assert_eq!(vcf.format, FileFormat::Vcf);

        // Known columns are found; an unknown name is not.
        assert!(vcf.has_column("chrom"));
        assert!(vcf.has_column("qual"));
        assert!(!vcf.has_column("invalid"));

        let qual = vcf.column("qual").unwrap();
        assert_eq!(qual.dtype, DataType::Float64);
        assert_eq!(qual.genomic_type, Some(GenomicType::Quality));
    }

    /// BAM: format tag and the semantic type of `mapq`.
    #[test]
    fn test_bam_schema() {
        let bam = GenomicSchema::bam();
        assert_eq!(bam.format, FileFormat::Bam);
        assert!(bam.has_column("mapq"));

        let mapq = bam.column("mapq").unwrap();
        assert_eq!(mapq.genomic_type, Some(GenomicType::MappingQuality));
    }

    /// BED: six columns, including the interval triple.
    #[test]
    fn test_bed_schema() {
        let bed = GenomicSchema::bed();
        assert_eq!(bed.column_names().len(), 6);
        assert!(bed.has_column("chrom"));
        assert!(bed.has_column("start"));
        assert!(bed.has_column("end"));
    }

    /// FASTQ: three columns, with `sequence` tagged as a sequence.
    #[test]
    fn test_fastq_schema() {
        let fastq = GenomicSchema::fastq();
        assert_eq!(fastq.columns.len(), 3);

        let sequence = fastq.column("sequence").unwrap();
        assert_eq!(sequence.genomic_type, Some(GenomicType::Sequence));
    }

    /// `FileFormat::schema` hands back a schema tagged with the same format.
    #[test]
    fn test_format_schema_method() {
        let from_vcf = FileFormat::Vcf.schema();
        assert_eq!(from_vcf.format, FileFormat::Vcf);

        let from_bam = FileFormat::Bam.schema();
        assert_eq!(from_bam.format, FileFormat::Bam);
    }
}