1use std::collections::HashMap;
7
8#[derive(Debug, Clone)]
16pub struct GenomicSchema {
17 pub format: FileFormat,
19
20 pub columns: Vec<ColumnDef>,
22
23 column_map: HashMap<String, usize>,
25}
26
27#[derive(Debug, Clone, PartialEq)]
29pub struct ColumnDef {
30 pub name: String,
32
33 pub dtype: DataType,
35
36 pub genomic_type: Option<GenomicType>,
38
39 pub nullable: bool,
41
42 pub description: String,
44}
45
/// Genomic file formats known to this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileFormat {
    /// VCF (Variant Call Format) files.
    Vcf,
    /// BAM (binary alignment) files.
    Bam,
    /// SAM (text alignment) files.
    Sam,
    /// BED interval files.
    Bed,
    /// FASTQ read files (sequence plus base qualities).
    Fastq,
    /// FASTA sequence files.
    Fasta,
    /// GFF feature-annotation files.
    Gff,
}
57
58#[derive(Debug, Clone, PartialEq)]
60pub enum DataType {
61 Boolean,
63
64 Int32,
66
67 Int64,
69
70 Float32,
72
73 Float64,
75
76 String,
78
79 List(Box<DataType>),
81
82 Struct(Vec<ColumnDef>),
84}
85
/// Genomic role a column plays, independent of its [`DataType`].
///
/// Derives `Hash` (like [`FileFormat`]) so roles can be used as `HashMap` /
/// `HashSet` keys, e.g. to group columns by role.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GenomicType {
    /// Chromosome / reference sequence name.
    Chromosome,

    /// Genomic coordinate (start, end or single position).
    Position,

    /// Quality score of a record.
    Quality,

    /// Reference allele.
    ReferenceAllele,

    /// Alternate allele(s).
    AlternateAllele,

    /// Filter status.
    Filter,

    /// Strand indicator.
    Strand,

    /// Mapping quality of an alignment.
    MappingQuality,

    /// CIGAR string of an alignment.
    Cigar,

    /// Nucleotide / read sequence.
    Sequence,

    /// Per-base quality scores.
    BaseQuality,
}
124
125impl GenomicSchema {
130 pub fn new(format: FileFormat, columns: Vec<ColumnDef>) -> Self {
132 let mut column_map = HashMap::new();
133 for (idx, col) in columns.iter().enumerate() {
134 column_map.insert(col.name.clone(), idx);
135 }
136
137 Self {
138 format,
139 columns,
140 column_map,
141 }
142 }
143
144 pub fn column(&self, name: &str) -> Option<&ColumnDef> {
146 self.column_map.get(name).map(|&idx| &self.columns[idx])
147 }
148
149 pub fn has_column(&self, name: &str) -> bool {
151 self.column_map.contains_key(name)
152 }
153
154 pub fn column_names(&self) -> Vec<&str> {
156 self.columns.iter().map(|c| c.name.as_str()).collect()
157 }
158}
159
160impl GenomicSchema {
165 pub fn vcf() -> Self {
167 Self::new(
168 FileFormat::Vcf,
169 vec![
170 ColumnDef {
171 name: "chrom".to_string(),
172 dtype: DataType::String,
173 genomic_type: Some(GenomicType::Chromosome),
174 nullable: false,
175 description: "Chromosome name".to_string(),
176 },
177 ColumnDef {
178 name: "pos".to_string(),
179 dtype: DataType::Int64,
180 genomic_type: Some(GenomicType::Position),
181 nullable: false,
182 description: "1-based position".to_string(),
183 },
184 ColumnDef {
185 name: "id".to_string(),
186 dtype: DataType::String,
187 genomic_type: None,
188 nullable: true,
189 description: "Variant ID".to_string(),
190 },
191 ColumnDef {
192 name: "ref".to_string(),
193 dtype: DataType::String,
194 genomic_type: Some(GenomicType::ReferenceAllele),
195 nullable: false,
196 description: "Reference allele".to_string(),
197 },
198 ColumnDef {
199 name: "alt".to_string(),
200 dtype: DataType::List(Box::new(DataType::String)),
201 genomic_type: Some(GenomicType::AlternateAllele),
202 nullable: false,
203 description: "Alternate allele(s)".to_string(),
204 },
205 ColumnDef {
206 name: "qual".to_string(),
207 dtype: DataType::Float64,
208 genomic_type: Some(GenomicType::Quality),
209 nullable: true,
210 description: "Quality score (Phred-scaled)".to_string(),
211 },
212 ColumnDef {
213 name: "filter".to_string(),
214 dtype: DataType::String,
215 genomic_type: Some(GenomicType::Filter),
216 nullable: false,
217 description: "Filter status (PASS, FAIL, etc.)".to_string(),
218 },
219 ],
220 )
221 }
222
223 pub fn bam() -> Self {
225 Self::new(
226 FileFormat::Bam,
227 vec![
228 ColumnDef {
229 name: "qname".to_string(),
230 dtype: DataType::String,
231 genomic_type: None,
232 nullable: false,
233 description: "Query name".to_string(),
234 },
235 ColumnDef {
236 name: "flag".to_string(),
237 dtype: DataType::Int32,
238 genomic_type: None,
239 nullable: false,
240 description: "Bitwise flags".to_string(),
241 },
242 ColumnDef {
243 name: "rname".to_string(),
244 dtype: DataType::String,
245 genomic_type: Some(GenomicType::Chromosome),
246 nullable: true,
247 description: "Reference sequence name".to_string(),
248 },
249 ColumnDef {
250 name: "pos".to_string(),
251 dtype: DataType::Int64,
252 genomic_type: Some(GenomicType::Position),
253 nullable: false,
254 description: "1-based leftmost position".to_string(),
255 },
256 ColumnDef {
257 name: "mapq".to_string(),
258 dtype: DataType::Int32,
259 genomic_type: Some(GenomicType::MappingQuality),
260 nullable: false,
261 description: "Mapping quality".to_string(),
262 },
263 ColumnDef {
264 name: "cigar".to_string(),
265 dtype: DataType::String,
266 genomic_type: Some(GenomicType::Cigar),
267 nullable: true,
268 description: "CIGAR string".to_string(),
269 },
270 ColumnDef {
271 name: "seq".to_string(),
272 dtype: DataType::String,
273 genomic_type: Some(GenomicType::Sequence),
274 nullable: false,
275 description: "Read sequence".to_string(),
276 },
277 ColumnDef {
278 name: "qual".to_string(),
279 dtype: DataType::String,
280 genomic_type: Some(GenomicType::BaseQuality),
281 nullable: true,
282 description: "Base quality scores".to_string(),
283 },
284 ],
285 )
286 }
287
288 pub fn bed() -> Self {
290 Self::new(
291 FileFormat::Bed,
292 vec![
293 ColumnDef {
294 name: "chrom".to_string(),
295 dtype: DataType::String,
296 genomic_type: Some(GenomicType::Chromosome),
297 nullable: false,
298 description: "Chromosome name".to_string(),
299 },
300 ColumnDef {
301 name: "start".to_string(),
302 dtype: DataType::Int64,
303 genomic_type: Some(GenomicType::Position),
304 nullable: false,
305 description: "0-based start position".to_string(),
306 },
307 ColumnDef {
308 name: "end".to_string(),
309 dtype: DataType::Int64,
310 genomic_type: Some(GenomicType::Position),
311 nullable: false,
312 description: "End position (exclusive)".to_string(),
313 },
314 ColumnDef {
315 name: "name".to_string(),
316 dtype: DataType::String,
317 genomic_type: None,
318 nullable: true,
319 description: "Feature name".to_string(),
320 },
321 ColumnDef {
322 name: "score".to_string(),
323 dtype: DataType::Float64,
324 genomic_type: None,
325 nullable: true,
326 description: "Score (0-1000)".to_string(),
327 },
328 ColumnDef {
329 name: "strand".to_string(),
330 dtype: DataType::String,
331 genomic_type: Some(GenomicType::Strand),
332 nullable: true,
333 description: "Strand (+/-)".to_string(),
334 },
335 ],
336 )
337 }
338
339 pub fn fastq() -> Self {
341 Self::new(
342 FileFormat::Fastq,
343 vec![
344 ColumnDef {
345 name: "id".to_string(),
346 dtype: DataType::String,
347 genomic_type: None,
348 nullable: false,
349 description: "Read identifier".to_string(),
350 },
351 ColumnDef {
352 name: "sequence".to_string(),
353 dtype: DataType::String,
354 genomic_type: Some(GenomicType::Sequence),
355 nullable: false,
356 description: "Read sequence".to_string(),
357 },
358 ColumnDef {
359 name: "quality".to_string(),
360 dtype: DataType::String,
361 genomic_type: Some(GenomicType::BaseQuality),
362 nullable: false,
363 description: "Base quality scores (Phred+33)".to_string(),
364 },
365 ],
366 )
367 }
368}
369
370impl FileFormat {
375 pub fn schema(&self) -> GenomicSchema {
377 match self {
378 FileFormat::Vcf => GenomicSchema::vcf(),
379 FileFormat::Bam | FileFormat::Sam => GenomicSchema::bam(),
380 FileFormat::Bed => GenomicSchema::bed(),
381 FileFormat::Fastq => GenomicSchema::fastq(),
382 FileFormat::Fasta => {
383 GenomicSchema::new(
385 FileFormat::Fasta,
386 vec![
387 ColumnDef {
388 name: "id".to_string(),
389 dtype: DataType::String,
390 genomic_type: None,
391 nullable: false,
392 description: "Sequence identifier".to_string(),
393 },
394 ColumnDef {
395 name: "sequence".to_string(),
396 dtype: DataType::String,
397 genomic_type: Some(GenomicType::Sequence),
398 nullable: false,
399 description: "Sequence".to_string(),
400 },
401 ],
402 )
403 }
404 FileFormat::Gff => {
405 GenomicSchema::new(
407 FileFormat::Gff,
408 vec![
409 ColumnDef {
410 name: "seqid".to_string(),
411 dtype: DataType::String,
412 genomic_type: Some(GenomicType::Chromosome),
413 nullable: false,
414 description: "Sequence ID".to_string(),
415 },
416 ColumnDef {
417 name: "source".to_string(),
418 dtype: DataType::String,
419 genomic_type: None,
420 nullable: true,
421 description: "Source".to_string(),
422 },
423 ColumnDef {
424 name: "type".to_string(),
425 dtype: DataType::String,
426 genomic_type: None,
427 nullable: false,
428 description: "Feature type".to_string(),
429 },
430 ColumnDef {
431 name: "start".to_string(),
432 dtype: DataType::Int64,
433 genomic_type: Some(GenomicType::Position),
434 nullable: false,
435 description: "1-based start position".to_string(),
436 },
437 ColumnDef {
438 name: "end".to_string(),
439 dtype: DataType::Int64,
440 genomic_type: Some(GenomicType::Position),
441 nullable: false,
442 description: "End position (inclusive)".to_string(),
443 },
444 ColumnDef {
445 name: "score".to_string(),
446 dtype: DataType::Float64,
447 genomic_type: None,
448 nullable: true,
449 description: "Score".to_string(),
450 },
451 ColumnDef {
452 name: "strand".to_string(),
453 dtype: DataType::String,
454 genomic_type: Some(GenomicType::Strand),
455 nullable: true,
456 description: "Strand (+/-/.)".to_string(),
457 },
458 ],
459 )
460 }
461 }
462 }
463}
464
#[cfg(test)]
mod tests {
    use super::*;

    /// The VCF preset carries the VCF tag and types `qual` as a Float64 quality column.
    #[test]
    fn test_vcf_schema() {
        let vcf = GenomicSchema::vcf();
        assert_eq!(vcf.format, FileFormat::Vcf);
        assert!(vcf.has_column("chrom"));
        assert!(vcf.has_column("qual"));
        assert!(!vcf.has_column("invalid"));

        let ColumnDef {
            dtype,
            genomic_type,
            ..
        } = vcf.column("qual").unwrap();
        assert_eq!(*dtype, DataType::Float64);
        assert_eq!(*genomic_type, Some(GenomicType::Quality));
    }

    /// The BAM preset carries the BAM tag and marks `mapq` as mapping quality.
    #[test]
    fn test_bam_schema() {
        let bam = GenomicSchema::bam();
        assert_eq!(bam.format, FileFormat::Bam);
        assert!(bam.has_column("mapq"));

        let mapq_role = bam.column("mapq").unwrap().genomic_type;
        assert_eq!(mapq_role, Some(GenomicType::MappingQuality));
    }

    /// The BED preset has six columns, including the three interval columns.
    #[test]
    fn test_bed_schema() {
        let bed = GenomicSchema::bed();
        let names = bed.column_names();
        assert_eq!(names.len(), 6);
        assert!(bed.has_column("chrom"));
        assert!(bed.has_column("start"));
        assert!(bed.has_column("end"));
    }

    /// The FASTQ preset has three columns and marks `sequence` as a sequence.
    #[test]
    fn test_fastq_schema() {
        let fastq = GenomicSchema::fastq();
        assert_eq!(fastq.columns.len(), 3);

        let sequence_role = fastq.column("sequence").unwrap().genomic_type;
        assert_eq!(sequence_role, Some(GenomicType::Sequence));
    }

    /// `FileFormat::schema` returns schemas tagged with the requesting format.
    #[test]
    fn test_format_schema_method() {
        let from_vcf = FileFormat::Vcf.schema();
        assert_eq!(from_vcf.format, FileFormat::Vcf);

        let from_bam = FileFormat::Bam.schema();
        assert_eq!(from_bam.format, FileFormat::Bam);
    }
}