1use std::{str::FromStr, sync::Arc};
16
17use arrow::{
18 array::{ArrayRef, GenericListBuilder, GenericStringBuilder, Int8Builder},
19 datatypes::SchemaRef,
20 error::ArrowError,
21};
22use exon_common::ExonArrayBuilder;
23use noodles::fasta::record::Definition;
24
25use crate::{ExonFASTAError, SequenceDataType};
26
/// Accumulates FASTA records into Arrow column builders.
///
/// One builder is kept per column (name, description, sequence). The
/// `append_*` flags record which of those columns the projection selected.
pub struct FASTAArrayBuilder {
    /// Builder for the record name column.
    names: GenericStringBuilder<i32>,
    /// Builder for the description column; a null is stored for records
    /// whose definition has no description.
    descriptions: GenericStringBuilder<i32>,
    /// Builder for the sequence column, in the representation chosen at
    /// construction time.
    sequences: SequenceBuilder,
    /// Column indices requested by the caller (every schema field when no
    /// projection was supplied).
    projection: Vec<usize>,
    /// Whether the projection contains column 0 (name).
    append_name: bool,
    /// Whether the projection contains column 1 (description).
    append_description: bool,
    /// Whether the projection contains column 2 (sequence).
    append_sequence: bool,
    /// Number of records appended successfully since construction; it is
    /// not reset when the arrays are finished.
    rows: usize,
}
37
/// Column builder for the sequence field, one variant per supported
/// sequence representation.
pub enum SequenceBuilder {
    /// Sequence stored as a string built with `i32` offsets.
    Utf8(GenericStringBuilder<i32>),
    /// Sequence stored as a string built with `i64` offsets.
    LargeUtf8(GenericStringBuilder<i64>),
    /// DNA sequence stored as a list of `i8` codes, one code per nucleotide.
    IntegerEncodeDNA(GenericListBuilder<i32, Int8Builder>),
    /// Protein sequence stored as a list of `i8` codes, one code per residue.
    IntegerEncodeProtein(GenericListBuilder<i32, Int8Builder>),
}
44
45impl SequenceBuilder {
46 fn finish(&mut self) -> ArrayRef {
47 match self {
48 Self::Utf8(ref mut builder) => Arc::new(builder.finish()),
49 Self::LargeUtf8(ref mut builder) => Arc::new(builder.finish()),
50 Self::IntegerEncodeProtein(ref mut builder) => Arc::new(builder.finish()),
51 Self::IntegerEncodeDNA(ref mut builder) => Arc::new(builder.finish()),
52 }
53 }
54}
55
56impl FASTAArrayBuilder {
57 pub fn create(
59 schema: SchemaRef,
60 projection: Option<Vec<usize>>,
61 capacity: usize,
62 sequence_data_type: &SequenceDataType,
63 ) -> Result<Self, ArrowError> {
64 let sequence_builder = match sequence_data_type {
65 SequenceDataType::Utf8 => SequenceBuilder::Utf8(
66 GenericStringBuilder::<i32>::with_capacity(capacity, capacity),
67 ),
68 SequenceDataType::LargeUtf8 => SequenceBuilder::LargeUtf8(
69 GenericStringBuilder::<i64>::with_capacity(capacity, capacity),
70 ),
71 SequenceDataType::IntegerEncodeProtein => SequenceBuilder::IntegerEncodeProtein(
72 GenericListBuilder::<i32, Int8Builder>::with_capacity(
73 Int8Builder::with_capacity(60),
74 capacity,
75 ),
76 ),
77 SequenceDataType::IntegerEncodeDNA => SequenceBuilder::IntegerEncodeDNA(
78 GenericListBuilder::<i32, Int8Builder>::with_capacity(
79 Int8Builder::with_capacity(60),
80 capacity,
81 ),
82 ),
83 };
84
85 let projection = match projection {
86 Some(projection) => projection,
87 None => (0..schema.fields().len()).collect(),
88 };
89
90 let append_name = projection.contains(&0);
91 let append_description = projection.contains(&1);
92 let append_sequence = projection.contains(&2);
93
94 Ok(Self {
95 names: GenericStringBuilder::<i32>::with_capacity(capacity, capacity),
96 descriptions: GenericStringBuilder::<i32>::with_capacity(capacity, capacity),
97 sequences: sequence_builder,
98 projection,
99 rows: 0,
100 append_sequence,
101 append_name,
102 append_description,
103 })
104 }
105
106 pub fn len(&self) -> usize {
107 self.rows
108 }
109
110 pub fn is_empty(&self) -> bool {
111 self.len() == 0
112 }
113
114 pub fn append(&mut self, definition: &str, sequence: &[u8]) -> Result<(), ArrowError> {
115 if self.append_name || self.append_description {
116 let definition = Definition::from_str(definition)
117 .map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
118
119 if self.append_name {
120 let name = std::str::from_utf8(definition.name())?;
121 self.names.append_value(name);
122 }
123
124 if self.append_description {
125 if let Some(description) = definition.description() {
126 let description = std::str::from_utf8(description)?;
127 self.descriptions.append_value(description);
128 } else {
129 self.descriptions.append_null();
130 }
131 }
132 }
133
134 if self.append_sequence {
135 match &mut self.sequences {
136 SequenceBuilder::Utf8(ref mut builder) => {
137 let sequence = std::str::from_utf8(sequence)?;
138 builder.append_value(sequence);
139 }
140 SequenceBuilder::LargeUtf8(ref mut builder) => {
141 let sequence = std::str::from_utf8(sequence)?;
142 builder.append_value(sequence);
143 }
144 SequenceBuilder::IntegerEncodeProtein(ref mut builder) => {
145 let values = builder.values();
146
147 for aa in sequence {
148 let aa = match aa {
149 b'A' => 1,
150 b'B' => 2,
151 b'C' => 3,
152 b'D' => 4,
153 b'E' => 5,
154 b'F' => 6,
155 b'G' => 7,
156 b'H' => 8,
157 b'I' => 9,
158 b'K' => 10,
159 b'L' => 11,
160 b'M' => 12,
161 b'N' => 13,
162 b'O' => 14,
163 b'P' => 15,
164 b'Q' => 16,
165 b'R' => 17,
166 b'S' => 18,
167 b'T' => 19,
168 b'U' => 20,
169 b'V' => 21,
170 b'W' => 22,
171 b'Y' => 23,
172 b'X' => 24,
173 b'Z' => 25,
174 _ => {
175 return Err(ExonFASTAError::InvalidAminoAcid(*aa).into());
176 }
177 };
178
179 values.append_value(aa);
180 }
181
182 builder.append(true);
183 }
184 SequenceBuilder::IntegerEncodeDNA(ref mut builder) => {
185 let values = builder.values();
186
187 for nt in sequence {
190 let nt = match nt {
191 b'A' => 1,
192 b'C' => 2,
193 b'G' => 3,
194 b'T' => 4,
195 b'N' => 5,
196 _ => {
197 return Err(ExonFASTAError::InvalidNucleotide(*nt).into());
198 }
199 };
200
201 values.append_value(nt);
202 }
203
204 builder.append(true);
205 }
206 }
207 }
208
209 self.rows += 1;
210 Ok(())
211 }
212
213 fn finish_inner(&mut self) -> Vec<ArrayRef> {
214 let mut arrays: Vec<ArrayRef> = Vec::with_capacity(self.projection.len());
215
216 if self.append_name {
217 arrays.push(Arc::new(self.names.finish()));
218 }
219
220 if self.append_description {
221 arrays.push(Arc::new(self.descriptions.finish()));
222 }
223
224 if self.append_sequence {
225 arrays.push(self.sequences.finish());
226 }
227
228 arrays
229 }
230}
231
232impl ExonArrayBuilder for FASTAArrayBuilder {
233 fn finish(&mut self) -> Vec<ArrayRef> {
234 self.finish_inner()
235 }
236
237 fn len(&self) -> usize {
238 self.len()
239 }
240}