exon_bed/
schema.rs

1// Copyright 2024 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::datatypes::{DataType, Field, Schema};
18use exon_common::TableSchema;
19
20use crate::{ExonBEDError, ExonBEDResult};
21
22pub struct BEDSchemaBuilder {
23    file_fields: Vec<Field>,
24    partition_fields: Vec<Field>,
25}
26
27fn file_fields(n_fields: usize) -> ExonBEDResult<Vec<Field>> {
28    if !(3..=12).contains(&n_fields) {
29        return Err(ExonBEDError::InvalidNumberOfFields(n_fields));
30    }
31
32    let field_fields = vec![
33        Field::new("reference_sequence_name", DataType::Utf8, false),
34        Field::new("start", DataType::Int64, false),
35        Field::new("end", DataType::Int64, false),
36        Field::new("name", DataType::Utf8, true),
37        Field::new("score", DataType::Int64, true),
38        Field::new("strand", DataType::Utf8, true),
39        Field::new("thick_start", DataType::Int64, true),
40        Field::new("thick_end", DataType::Int64, true),
41        Field::new("color", DataType::Utf8, true),
42        Field::new("block_count", DataType::Int64, true),
43        Field::new("block_sizes", DataType::Utf8, true),
44        Field::new("block_starts", DataType::Utf8, true),
45    ];
46
47    Ok(field_fields[0..n_fields].to_vec())
48}
49
50impl BEDSchemaBuilder {
51    pub fn new(file_fields: Vec<Field>, partition_fields: Vec<Field>) -> Self {
52        Self {
53            file_fields,
54            partition_fields,
55        }
56    }
57
58    pub fn add_partition_fields(&mut self, fields: Vec<Field>) {
59        self.partition_fields.extend(fields);
60    }
61
62    /// Returns the schema and the projection indexes for the file's schema
63    pub fn build(self) -> TableSchema {
64        let mut fields = self.file_fields.clone();
65        fields.extend_from_slice(&self.partition_fields);
66
67        let schema = Schema::new(fields);
68
69        let projection = (0..self.file_fields.len()).collect::<Vec<_>>();
70
71        TableSchema::new(Arc::new(schema), projection)
72    }
73
74    /// From number of fields, create a schema with default fields
75    pub fn with_n_fields(n_fields: usize) -> ExonBEDResult<Self> {
76        let field_fields = file_fields(n_fields)?;
77
78        Ok(Self::new(field_fields, vec![]))
79    }
80}
81
82impl Default for BEDSchemaBuilder {
83    fn default() -> Self {
84        let field_fields = file_fields(12).unwrap();
85        Self::new(field_fields, vec![])
86    }
87}