exon_gff/
array_builder.rs

1// Copyright 2023 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::{
18    array::{
19        ArrayRef, Float32Builder, GenericListBuilder, GenericStringBuilder, Int64Builder,
20        MapBuilder,
21    },
22    datatypes::SchemaRef,
23    error::ArrowError,
24};
25use exon_common::ExonArrayBuilder;
26use noodles::gff::Record;
27
28pub struct GFFArrayBuilder {
29    seqnames: GenericStringBuilder<i32>,
30    sources: GenericStringBuilder<i32>,
31    feature_types: GenericStringBuilder<i32>,
32    starts: Int64Builder,
33    ends: Int64Builder,
34    scores: Float32Builder,
35    strands: GenericStringBuilder<i32>,
36    phases: GenericStringBuilder<i32>,
37    attributes:
38        MapBuilder<GenericStringBuilder<i32>, GenericListBuilder<i32, GenericStringBuilder<i32>>>,
39
40    projection: Vec<usize>,
41    rows: usize,
42}
43
44impl GFFArrayBuilder {
45    pub fn new(schema: SchemaRef, projection: Option<Vec<usize>>) -> Self {
46        let projection = match projection {
47            Some(projection) => projection,
48            None => (0..schema.fields().len()).collect(),
49        };
50
51        Self {
52            seqnames: GenericStringBuilder::<i32>::new(),
53            sources: GenericStringBuilder::<i32>::new(),
54            feature_types: GenericStringBuilder::<i32>::new(),
55            starts: Int64Builder::new(),
56            ends: Int64Builder::new(),
57            scores: Float32Builder::new(),
58            strands: GenericStringBuilder::<i32>::new(),
59            phases: GenericStringBuilder::<i32>::new(),
60            attributes: MapBuilder::new(
61                None,
62                GenericStringBuilder::<i32>::new(),
63                GenericListBuilder::<i32, GenericStringBuilder<i32>>::new(GenericStringBuilder::<
64                    i32,
65                >::new()),
66            ),
67            rows: 0,
68            projection,
69        }
70    }
71
72    /// Returns the number of records in the array builder.
73    pub fn len(&self) -> usize {
74        self.rows
75    }
76
77    /// Returns whether the array builder is empty.
78    pub fn is_empty(&self) -> bool {
79        self.len() == 0
80    }
81
82    pub fn append(&mut self, record: &Record) -> Result<(), ArrowError> {
83        for col_idx in self.projection.iter() {
84            match col_idx {
85                0 => self.seqnames.append_value(record.reference_sequence_name()),
86                1 => self.sources.append_value(record.source()),
87                2 => self.feature_types.append_value(record.ty()),
88                3 => {
89                    let start_pos = record.start()?;
90                    self.starts.append_value(start_pos.get() as i64)
91                }
92                4 => {
93                    let end_pos = record.end()?;
94                    self.ends.append_value(end_pos.get() as i64)
95                }
96                5 => {
97                    let score = record.score();
98
99                    match score {
100                        Some(Ok(score)) => {
101                            self.scores.append_value(score);
102                        }
103                        Some(Err(e)) => return Err(ArrowError::ExternalError(Box::new(e))),
104                        None => self.scores.append_null(),
105                    }
106                }
107                6 => {
108                    let strand = record.strand()?;
109
110                    if strand.as_ref() == "" || strand.as_ref() == "." {
111                        self.strands.append_null();
112                    } else {
113                        self.strands.append_value(strand);
114                    }
115                }
116                7 => {
117                    let phase = record.phase();
118
119                    match phase {
120                        Some(Ok(phase)) => {
121                            self.phases.append_value(phase);
122                        }
123                        Some(Err(e)) => return Err(ArrowError::ExternalError(Box::new(e))),
124                        None => self.phases.append_null(),
125                    }
126                }
127                8 => {
128                    for resp in record.attributes().iter() {
129                        let (key, value) = resp?;
130
131                        self.attributes.keys().append_value(key);
132
133                        match value {
134                            noodles::gff::record::attributes::field::Value::String(value) => {
135                                self.attributes.values().append(true);
136                                self.attributes.values().values().append_value(value);
137                            }
138                            noodles::gff::record::attributes::field::Value::Array(attr_values) => {
139                                let list_values = self.attributes.values().values();
140                                for value in attr_values.iter() {
141                                    let value = value?;
142
143                                    list_values.append_value(value);
144                                }
145                                self.attributes.values().append(true);
146                            }
147                        }
148                    }
149
150                    self.attributes.append(true)?;
151                }
152                _ => {
153                    return Err(ArrowError::ExternalError(
154                        "Unexpected number of columns in projections".into(),
155                    ))
156                }
157            }
158        }
159
160        self.rows += 1;
161        Ok(())
162    }
163
164    pub fn finish(&mut self) -> Vec<ArrayRef> {
165        let mut arrays: Vec<ArrayRef> = Vec::with_capacity(self.projection.len());
166
167        for col_idx in self.projection.iter() {
168            match col_idx {
169                0 => arrays.push(Arc::new(self.seqnames.finish())),
170                1 => arrays.push(Arc::new(self.sources.finish())),
171                2 => arrays.push(Arc::new(self.feature_types.finish())),
172                3 => arrays.push(Arc::new(self.starts.finish())),
173                4 => arrays.push(Arc::new(self.ends.finish())),
174                5 => arrays.push(Arc::new(self.scores.finish())),
175                6 => arrays.push(Arc::new(self.strands.finish())),
176                7 => arrays.push(Arc::new(self.phases.finish())),
177                8 => arrays.push(Arc::new(self.attributes.finish())),
178                _ => panic!("Invalid col_idx for GFF ({})", col_idx),
179            }
180        }
181
182        arrays
183    }
184}
185
186impl ExonArrayBuilder for GFFArrayBuilder {
187    /// Finishes building the internal data structures and returns the built arrays.
188    fn finish(&mut self) -> Vec<ArrayRef> {
189        self.finish()
190    }
191
192    /// Returns the number of elements in the array.
193    fn len(&self) -> usize {
194        self.rows
195    }
196}