exon_gtf/
config.rs

1// Copyright 2023 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::datatypes::{DataType, Field, SchemaRef};
18use exon_common::{TableSchemaBuilder, DEFAULT_BATCH_SIZE};
19use object_store::ObjectStore;
20
21pub fn new_gtf_schema_builder() -> TableSchemaBuilder {
22    let file_fields = file_fields();
23    TableSchemaBuilder::new_with_field_fields(file_fields)
24}
25
26/// The schema for a GTF file
27fn file_fields() -> Vec<Field> {
28    let attribute_key_field = Field::new("keys", DataType::Utf8, false);
29    let attribute_value_field = Field::new("values", DataType::Utf8, true);
30
31    vec![
32        // https://useast.ensembl.org/info/website/upload/gff.html
33        Field::new("seqname", DataType::Utf8, false),
34        Field::new("source", DataType::Utf8, true),
35        Field::new("type", DataType::Utf8, false),
36        Field::new("start", DataType::Int64, false),
37        Field::new("end", DataType::Int64, false),
38        Field::new("score", DataType::Float32, true),
39        Field::new("strand", DataType::Utf8, false),
40        Field::new("frame", DataType::Utf8, true),
41        Field::new_map(
42            "attributes",
43            "entries",
44            attribute_key_field,
45            attribute_value_field,
46            false,
47            true,
48        ),
49    ]
50}
51
52/// Configuration for a GTF data source.
53pub struct GTFConfig {
54    /// The number of rows to read at a time.
55    pub batch_size: usize,
56
57    /// The schema of the GTF file. This is static.
58    pub file_schema: SchemaRef,
59
60    /// The object store to use for reading GTF files.
61    pub object_store: Arc<dyn ObjectStore>,
62
63    /// Any projections to apply to the resulting batches.
64    pub projection: Option<Vec<usize>>,
65}
66
67impl GTFConfig {
68    /// Create a new GTF configuration.
69    pub fn new(object_store: Arc<dyn ObjectStore>, file_schema: SchemaRef) -> Self {
70        Self {
71            file_schema,
72            object_store,
73            batch_size: DEFAULT_BATCH_SIZE,
74            projection: None,
75        }
76    }
77
78    /// Set the file schema.
79    pub fn with_schema(mut self, file_schema: SchemaRef) -> Self {
80        self.file_schema = file_schema;
81        self
82    }
83
84    /// Set the batch size.
85    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
86        self.batch_size = batch_size;
87        self
88    }
89
90    /// Set the projection.
91    pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
92        self.projection = Some(projection);
93        self
94    }
95
96    /// Set the projection from an optional vector.
97    pub fn with_some_projection(mut self, projection: Option<Vec<usize>>) -> Self {
98        self.projection = projection;
99        self
100    }
101}