1use std::{str::FromStr, sync::Arc};
16
17use arrow::datatypes::{DataType, Field, SchemaRef};
18use exon_common::TableSchema;
19use noodles::core::Region;
20use object_store::ObjectStore;
21
22use crate::ExonFASTAError;
23
/// Selects how the `sequence` column of a FASTA table is typed.
///
/// The variant is parsed from a configuration string via `FromStr` and
/// decides the Arrow data type used for the `sequence` field when the table
/// schema is built.
///
/// `PartialEq`/`Eq` are derived so callers can compare variants directly
/// (for example in tests or option validation) instead of pattern matching.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SequenceDataType {
    /// Sequence stored as an Arrow `Utf8` string.
    Utf8,
    /// Sequence stored as an Arrow `LargeUtf8` string.
    LargeUtf8,
    /// Sequence stored as a list of `Int8` values.
    /// NOTE(review): presumably one integer code per amino acid; the encoding
    /// itself is defined outside this file.
    IntegerEncodeProtein,
    /// Sequence stored as a list of `Int8` values.
    /// NOTE(review): presumably one integer code per nucleotide; the encoding
    /// itself is defined outside this file.
    IntegerEncodeDNA,
}
31
32impl FromStr for SequenceDataType {
33 type Err = ExonFASTAError;
34
35 fn from_str(s: &str) -> Result<Self, Self::Err> {
36 match s {
37 "utf8" => Ok(Self::Utf8),
38 "large_utf8" => Ok(Self::LargeUtf8),
39 "integer_encode_protein" => Ok(Self::IntegerEncodeProtein),
40 "integer_encode_dna" => Ok(Self::IntegerEncodeDNA),
41 _ => Err(ExonFASTAError::InvalidSequenceDataType(s.to_string())),
42 }
43 }
44}
45
46#[derive(Debug)]
48pub struct FASTAConfig {
49 pub batch_size: usize,
51
52 pub file_schema: SchemaRef,
54
55 pub object_store: Arc<dyn ObjectStore>,
57
58 pub projection: Option<Vec<usize>>,
60
61 pub fasta_sequence_buffer_capacity: usize,
63
64 pub sequence_data_type: SequenceDataType,
66
67 pub region: Option<Region>,
69
70 pub region_file: Option<String>,
72}
73
74impl FASTAConfig {
75 pub fn new(object_store: Arc<dyn ObjectStore>, file_schema: SchemaRef) -> Self {
77 Self {
78 object_store,
79 file_schema,
80 batch_size: exon_common::DEFAULT_BATCH_SIZE,
81 projection: None,
82 fasta_sequence_buffer_capacity: 384,
83 sequence_data_type: SequenceDataType::Utf8,
84 region: None,
85 region_file: None,
86 }
87 }
88
89 pub fn with_region(mut self, region: Region) -> Self {
91 self.region = Some(region);
92 self
93 }
94
95 pub fn with_region_file(mut self, region_file: String) -> Self {
97 self.region_file = Some(region_file);
98 self
99 }
100
101 pub fn with_batch_size(mut self, batch_size: usize) -> Self {
103 self.batch_size = batch_size;
104 self
105 }
106
107 pub fn projection(&self) -> Vec<usize> {
109 self.projection
110 .clone()
111 .unwrap_or_else(|| (0..self.file_schema.fields().len()).collect())
112 }
113
114 pub fn projected_schema(&self) -> arrow::error::Result<SchemaRef> {
116 let schema = self.file_schema.project(&self.projection())?;
117
118 Ok(Arc::new(schema))
119 }
120
121 pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
123 let file_projection = projection
127 .iter()
128 .filter(|f| **f < self.file_schema.fields().len())
129 .cloned()
130 .collect::<Vec<_>>();
131
132 self.projection = Some(file_projection);
133 self
134 }
135
136 pub fn with_fasta_sequence_buffer_capacity(
138 mut self,
139 fasta_sequence_buffer_capacity: usize,
140 ) -> Self {
141 self.fasta_sequence_buffer_capacity = fasta_sequence_buffer_capacity;
142 self
143 }
144
145 pub fn with_sequence_data_type(mut self, sequence_data_type: SequenceDataType) -> Self {
146 self.sequence_data_type = sequence_data_type;
147 self
148 }
149}
150
151pub struct FASTASchemaBuilder {
152 fields: Vec<Field>,
154
155 partition_fields: Vec<Field>,
157
158 sequence_data_type: SequenceDataType,
160}
161
162impl Default for FASTASchemaBuilder {
163 fn default() -> Self {
164 Self {
165 fields: vec![
166 Field::new("id", DataType::Utf8, false),
167 Field::new("description", DataType::Utf8, true),
168 Field::new("sequence", DataType::Utf8, false),
169 ],
170 partition_fields: vec![],
171 sequence_data_type: SequenceDataType::Utf8,
172 }
173 }
174}
175
176impl FASTASchemaBuilder {
177 pub fn with_sequence_data_type(mut self, sequence_data_type: SequenceDataType) -> Self {
179 self.sequence_data_type = sequence_data_type;
180 self
181 }
182
183 pub fn with_partition_fields(mut self, partition_fields: Vec<Field>) -> Self {
185 self.partition_fields.extend(partition_fields);
186 self
187 }
188
189 pub fn build(&mut self) -> TableSchema {
190 let mut fields = self.fields.clone();
191
192 match self.sequence_data_type {
193 SequenceDataType::Utf8 => {
194 let field = Field::new("sequence", DataType::Utf8, true);
195 fields[2] = field;
196 }
197 SequenceDataType::LargeUtf8 => {
198 let field = Field::new("sequence", DataType::LargeUtf8, true);
199 fields[2] = field;
200 }
201 SequenceDataType::IntegerEncodeProtein => {
202 let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int8, true)));
203
204 let field = Field::new("sequence", data_type, true);
205 fields[2] = field;
206 }
207 SequenceDataType::IntegerEncodeDNA => {
208 let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int8, true)));
209
210 let field = Field::new("sequence", data_type, true);
211 fields[2] = field;
212 }
213 }
214
215 let file_field_projection = self
216 .fields
217 .iter()
218 .enumerate()
219 .map(|(i, _)| i)
220 .collect::<Vec<_>>();
221
222 fields.extend(self.partition_fields.clone());
223
224 let arrow_schema = Arc::new(arrow::datatypes::Schema::new(fields.clone()));
225 TableSchema::new(arrow_schema, file_field_projection)
226 }
227}