exon_fasta/
config.rs

1// Copyright 2023 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::{str::FromStr, sync::Arc};
16
17use arrow::datatypes::{DataType, Field, SchemaRef};
18use exon_common::TableSchema;
19use noodles::core::Region;
20use object_store::ObjectStore;
21
22use crate::ExonFASTAError;
23
/// How the `sequence` column of a FASTA record is represented.
///
/// Each variant can be parsed from a string through the [`FromStr`]
/// implementation in this file (`"utf8"`, `"large_utf8"`,
/// `"integer_encode_protein"`, `"integer_encode_dna"`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SequenceDataType {
    /// The sequence column uses `DataType::Utf8`.
    Utf8,
    /// The sequence column uses `DataType::LargeUtf8`.
    LargeUtf8,
    /// Protein sequence; the sequence column is a list of `Int8` items.
    IntegerEncodeProtein,
    /// DNA sequence; the sequence column is a list of `Int8` items.
    IntegerEncodeDNA,
}
31
32impl FromStr for SequenceDataType {
33    type Err = ExonFASTAError;
34
35    fn from_str(s: &str) -> Result<Self, Self::Err> {
36        match s {
37            "utf8" => Ok(Self::Utf8),
38            "large_utf8" => Ok(Self::LargeUtf8),
39            "integer_encode_protein" => Ok(Self::IntegerEncodeProtein),
40            "integer_encode_dna" => Ok(Self::IntegerEncodeDNA),
41            _ => Err(ExonFASTAError::InvalidSequenceDataType(s.to_string())),
42        }
43    }
44}
45
46/// Configuration for a FASTA data source.
47#[derive(Debug)]
48pub struct FASTAConfig {
49    /// The number of rows to read at a time.
50    pub batch_size: usize,
51
52    /// The schema of the FASTA file.
53    pub file_schema: SchemaRef,
54
55    /// The object store to use for reading FASTA files.
56    pub object_store: Arc<dyn ObjectStore>,
57
58    /// Any projections to apply to the resulting batches.
59    pub projection: Option<Vec<usize>>,
60
61    /// How many bytes to pre-allocate for the sequence.
62    pub fasta_sequence_buffer_capacity: usize,
63
64    /// The type of data to use for the sequence.
65    pub sequence_data_type: SequenceDataType,
66
67    /// An optional region to read from.
68    pub region: Option<Region>,
69
70    /// An optional region file to read from.
71    pub region_file: Option<String>,
72}
73
74impl FASTAConfig {
75    /// Create a new FASTA configuration.
76    pub fn new(object_store: Arc<dyn ObjectStore>, file_schema: SchemaRef) -> Self {
77        Self {
78            object_store,
79            file_schema,
80            batch_size: exon_common::DEFAULT_BATCH_SIZE,
81            projection: None,
82            fasta_sequence_buffer_capacity: 384,
83            sequence_data_type: SequenceDataType::Utf8,
84            region: None,
85            region_file: None,
86        }
87    }
88
89    /// Create a new FASTA configuration with a given region.
90    pub fn with_region(mut self, region: Region) -> Self {
91        self.region = Some(region);
92        self
93    }
94
95    /// Create a new FASTA configuration with a given region file.
96    pub fn with_region_file(mut self, region_file: String) -> Self {
97        self.region_file = Some(region_file);
98        self
99    }
100
101    /// Create a new FASTA configuration with a given batch size.
102    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
103        self.batch_size = batch_size;
104        self
105    }
106
107    /// Get the projection, returning the identity projection if none is set.
108    pub fn projection(&self) -> Vec<usize> {
109        self.projection
110            .clone()
111            .unwrap_or_else(|| (0..self.file_schema.fields().len()).collect())
112    }
113
114    /// Get the projected schema.
115    pub fn projected_schema(&self) -> arrow::error::Result<SchemaRef> {
116        let schema = self.file_schema.project(&self.projection())?;
117
118        Ok(Arc::new(schema))
119    }
120
121    /// Create a new FASTA configuration with a given projection.
122    pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
123        // Only include fields that are in the file schema.
124        // TODO: make this cleaner, i.e. projection should probably come
125        // pre-filtered.
126        let file_projection = projection
127            .iter()
128            .filter(|f| **f < self.file_schema.fields().len())
129            .cloned()
130            .collect::<Vec<_>>();
131
132        self.projection = Some(file_projection);
133        self
134    }
135
136    /// Create a new FASTA configuration with a given sequence capacity.
137    pub fn with_fasta_sequence_buffer_capacity(
138        mut self,
139        fasta_sequence_buffer_capacity: usize,
140    ) -> Self {
141        self.fasta_sequence_buffer_capacity = fasta_sequence_buffer_capacity;
142        self
143    }
144
145    pub fn with_sequence_data_type(mut self, sequence_data_type: SequenceDataType) -> Self {
146        self.sequence_data_type = sequence_data_type;
147        self
148    }
149}
150
151pub struct FASTASchemaBuilder {
152    /// The fields of the schema.
153    fields: Vec<Field>,
154
155    /// The partition fields to potentially add to the schema.
156    partition_fields: Vec<Field>,
157
158    /// The sequence data type.
159    sequence_data_type: SequenceDataType,
160}
161
162impl Default for FASTASchemaBuilder {
163    fn default() -> Self {
164        Self {
165            fields: vec![
166                Field::new("id", DataType::Utf8, false),
167                Field::new("description", DataType::Utf8, true),
168                Field::new("sequence", DataType::Utf8, false),
169            ],
170            partition_fields: vec![],
171            sequence_data_type: SequenceDataType::Utf8,
172        }
173    }
174}
175
176impl FASTASchemaBuilder {
177    /// Set the type of sequence to store.
178    pub fn with_sequence_data_type(mut self, sequence_data_type: SequenceDataType) -> Self {
179        self.sequence_data_type = sequence_data_type;
180        self
181    }
182
183    /// Extend the partition fields with the given fields.
184    pub fn with_partition_fields(mut self, partition_fields: Vec<Field>) -> Self {
185        self.partition_fields.extend(partition_fields);
186        self
187    }
188
189    pub fn build(&mut self) -> TableSchema {
190        let mut fields = self.fields.clone();
191
192        match self.sequence_data_type {
193            SequenceDataType::Utf8 => {
194                let field = Field::new("sequence", DataType::Utf8, true);
195                fields[2] = field;
196            }
197            SequenceDataType::LargeUtf8 => {
198                let field = Field::new("sequence", DataType::LargeUtf8, true);
199                fields[2] = field;
200            }
201            SequenceDataType::IntegerEncodeProtein => {
202                let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int8, true)));
203
204                let field = Field::new("sequence", data_type, true);
205                fields[2] = field;
206            }
207            SequenceDataType::IntegerEncodeDNA => {
208                let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int8, true)));
209
210                let field = Field::new("sequence", data_type, true);
211                fields[2] = field;
212            }
213        }
214
215        let file_field_projection = self
216            .fields
217            .iter()
218            .enumerate()
219            .map(|(i, _)| i)
220            .collect::<Vec<_>>();
221
222        fields.extend(self.partition_fields.clone());
223
224        let arrow_schema = Arc::new(arrow::datatypes::Schema::new(fields.clone()));
225        TableSchema::new(arrow_schema, file_field_projection)
226    }
227}