exon_gff/
config.rs

// Copyright 2023 WHERE TRUE Technologies.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
18use exon_common::TableSchemaBuilder;
19use object_store::ObjectStore;
20
21/// Configuration for a GFF data source.
22#[derive(Debug, Clone)]
23pub struct GFFConfig {
24    /// The number of rows to read at a time.
25    pub batch_size: usize,
26
27    /// The schema of the GFF file. This is static.
28    pub file_schema: SchemaRef,
29
30    /// The object store to use for reading GFF files.
31    pub object_store: Arc<dyn ObjectStore>,
32
33    /// Any projections to apply to the resulting batches.
34    pub projection: Option<Vec<usize>>,
35}
36
37impl GFFConfig {
38    /// Create a new GFF configuration.
39    pub fn new(object_store: Arc<dyn ObjectStore>, file_schema: Arc<Schema>) -> Self {
40        Self {
41            file_schema,
42            object_store,
43            batch_size: 8096,
44            projection: None,
45        }
46    }
47
48    /// Set the batch size.
49    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
50        self.batch_size = batch_size;
51        self
52    }
53
54    /// Set the projection.
55    pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
56        let file_projection = projection
57            .iter()
58            .filter(|f| **f < self.file_schema.fields().len())
59            .cloned()
60            .collect::<Vec<_>>();
61
62        self.projection = Some(file_projection);
63        self
64    }
65
66    /// Get the projection, returning the identity projection if none is set.
67    pub fn projection(&self) -> Vec<usize> {
68        self.projection
69            .clone()
70            .unwrap_or_else(|| (0..self.file_schema.fields().len()).collect())
71    }
72
73    /// Get the projected schema.
74    pub fn projected_schema(&self) -> arrow::error::Result<SchemaRef> {
75        let schema = self.file_schema.project(&self.projection())?;
76
77        Ok(Arc::new(schema))
78    }
79}
80
81pub fn new_gff_schema_builder() -> TableSchemaBuilder {
82    let attribute_key_field = Field::new("keys", DataType::Utf8, false);
83
84    // attribute_value_field is a list of strings
85    let value_field = Field::new("item", DataType::Utf8, true);
86    let attribute_value_field = Field::new("values", DataType::List(Arc::new(value_field)), true);
87
88    let fields = vec![
89        Field::new("seqname", DataType::Utf8, false),
90        Field::new("source", DataType::Utf8, true),
91        Field::new("type", DataType::Utf8, false),
92        Field::new("start", DataType::Int64, false),
93        Field::new("end", DataType::Int64, false),
94        Field::new("score", DataType::Float32, true),
95        Field::new("strand", DataType::Utf8, false),
96        Field::new("phase", DataType::Utf8, true),
97        Field::new_map(
98            "attributes",
99            "entries",
100            attribute_key_field,
101            attribute_value_field,
102            false,
103            true,
104        ),
105    ];
106
107    TableSchemaBuilder::new_with_field_fields(fields)
108}