exon_fasta/
array_builder.rs

1// Copyright 2023 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::{str::FromStr, sync::Arc};
16
17use arrow::{
18    array::{ArrayRef, GenericListBuilder, GenericStringBuilder, Int8Builder},
19    datatypes::SchemaRef,
20    error::ArrowError,
21};
22use exon_common::ExonArrayBuilder;
23use noodles::fasta::record::Definition;
24
25use crate::{ExonFASTAError, SequenceDataType};
26
27pub struct FASTAArrayBuilder {
28    names: GenericStringBuilder<i32>,
29    descriptions: GenericStringBuilder<i32>,
30    sequences: SequenceBuilder,
31    projection: Vec<usize>,
32    append_name: bool,
33    append_description: bool,
34    append_sequence: bool,
35    rows: usize,
36}
37
38pub enum SequenceBuilder {
39    Utf8(GenericStringBuilder<i32>),
40    LargeUtf8(GenericStringBuilder<i64>),
41    IntegerEncodeDNA(GenericListBuilder<i32, Int8Builder>),
42    IntegerEncodeProtein(GenericListBuilder<i32, Int8Builder>),
43}
44
45impl SequenceBuilder {
46    fn finish(&mut self) -> ArrayRef {
47        match self {
48            Self::Utf8(ref mut builder) => Arc::new(builder.finish()),
49            Self::LargeUtf8(ref mut builder) => Arc::new(builder.finish()),
50            Self::IntegerEncodeProtein(ref mut builder) => Arc::new(builder.finish()),
51            Self::IntegerEncodeDNA(ref mut builder) => Arc::new(builder.finish()),
52        }
53    }
54}
55
56impl FASTAArrayBuilder {
57    /// Create a new FASTA array builder.
58    pub fn create(
59        schema: SchemaRef,
60        projection: Option<Vec<usize>>,
61        capacity: usize,
62        sequence_data_type: &SequenceDataType,
63    ) -> Result<Self, ArrowError> {
64        let sequence_builder = match sequence_data_type {
65            SequenceDataType::Utf8 => SequenceBuilder::Utf8(
66                GenericStringBuilder::<i32>::with_capacity(capacity, capacity),
67            ),
68            SequenceDataType::LargeUtf8 => SequenceBuilder::LargeUtf8(
69                GenericStringBuilder::<i64>::with_capacity(capacity, capacity),
70            ),
71            SequenceDataType::IntegerEncodeProtein => SequenceBuilder::IntegerEncodeProtein(
72                GenericListBuilder::<i32, Int8Builder>::with_capacity(
73                    Int8Builder::with_capacity(60),
74                    capacity,
75                ),
76            ),
77            SequenceDataType::IntegerEncodeDNA => SequenceBuilder::IntegerEncodeDNA(
78                GenericListBuilder::<i32, Int8Builder>::with_capacity(
79                    Int8Builder::with_capacity(60),
80                    capacity,
81                ),
82            ),
83        };
84
85        let projection = match projection {
86            Some(projection) => projection,
87            None => (0..schema.fields().len()).collect(),
88        };
89
90        let append_name = projection.contains(&0);
91        let append_description = projection.contains(&1);
92        let append_sequence = projection.contains(&2);
93
94        Ok(Self {
95            names: GenericStringBuilder::<i32>::with_capacity(capacity, capacity),
96            descriptions: GenericStringBuilder::<i32>::with_capacity(capacity, capacity),
97            sequences: sequence_builder,
98            projection,
99            rows: 0,
100            append_sequence,
101            append_name,
102            append_description,
103        })
104    }
105
106    pub fn len(&self) -> usize {
107        self.rows
108    }
109
110    pub fn is_empty(&self) -> bool {
111        self.len() == 0
112    }
113
114    pub fn append(&mut self, definition: &str, sequence: &[u8]) -> Result<(), ArrowError> {
115        if self.append_name || self.append_description {
116            let definition = Definition::from_str(definition)
117                .map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
118
119            if self.append_name {
120                let name = std::str::from_utf8(definition.name())?;
121                self.names.append_value(name);
122            }
123
124            if self.append_description {
125                if let Some(description) = definition.description() {
126                    let description = std::str::from_utf8(description)?;
127                    self.descriptions.append_value(description);
128                } else {
129                    self.descriptions.append_null();
130                }
131            }
132        }
133
134        if self.append_sequence {
135            match &mut self.sequences {
136                SequenceBuilder::Utf8(ref mut builder) => {
137                    let sequence = std::str::from_utf8(sequence)?;
138                    builder.append_value(sequence);
139                }
140                SequenceBuilder::LargeUtf8(ref mut builder) => {
141                    let sequence = std::str::from_utf8(sequence)?;
142                    builder.append_value(sequence);
143                }
144                SequenceBuilder::IntegerEncodeProtein(ref mut builder) => {
145                    let values = builder.values();
146
147                    for aa in sequence {
148                        let aa = match aa {
149                            b'A' => 1,
150                            b'B' => 2,
151                            b'C' => 3,
152                            b'D' => 4,
153                            b'E' => 5,
154                            b'F' => 6,
155                            b'G' => 7,
156                            b'H' => 8,
157                            b'I' => 9,
158                            b'K' => 10,
159                            b'L' => 11,
160                            b'M' => 12,
161                            b'N' => 13,
162                            b'O' => 14,
163                            b'P' => 15,
164                            b'Q' => 16,
165                            b'R' => 17,
166                            b'S' => 18,
167                            b'T' => 19,
168                            b'U' => 20,
169                            b'V' => 21,
170                            b'W' => 22,
171                            b'Y' => 23,
172                            b'X' => 24,
173                            b'Z' => 25,
174                            _ => {
175                                return Err(ExonFASTAError::InvalidAminoAcid(*aa).into());
176                            }
177                        };
178
179                        values.append_value(aa);
180                    }
181
182                    builder.append(true);
183                }
184                SequenceBuilder::IntegerEncodeDNA(ref mut builder) => {
185                    let values = builder.values();
186
187                    // Convert the DNA sequence to one-hot encoding, use A => 1, C => 2, G => 3, T => 4, N => 5
188                    // error for non-ACGTN characters
189                    for nt in sequence {
190                        let nt = match nt {
191                            b'A' => 1,
192                            b'C' => 2,
193                            b'G' => 3,
194                            b'T' => 4,
195                            b'N' => 5,
196                            _ => {
197                                return Err(ExonFASTAError::InvalidNucleotide(*nt).into());
198                            }
199                        };
200
201                        values.append_value(nt);
202                    }
203
204                    builder.append(true);
205                }
206            }
207        }
208
209        self.rows += 1;
210        Ok(())
211    }
212
213    fn finish_inner(&mut self) -> Vec<ArrayRef> {
214        let mut arrays: Vec<ArrayRef> = Vec::with_capacity(self.projection.len());
215
216        if self.append_name {
217            arrays.push(Arc::new(self.names.finish()));
218        }
219
220        if self.append_description {
221            arrays.push(Arc::new(self.descriptions.finish()));
222        }
223
224        if self.append_sequence {
225            arrays.push(self.sequences.finish());
226        }
227
228        arrays
229    }
230}
231
232impl ExonArrayBuilder for FASTAArrayBuilder {
233    fn finish(&mut self) -> Vec<ArrayRef> {
234        self.finish_inner()
235    }
236
237    fn len(&self) -> usize {
238        self.len()
239    }
240}