exon_vcf/array_builder/
eager_array_builder.rs

1// Copyright 2023 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::{
18    array::{ArrayRef, Float32Builder, GenericListBuilder, GenericStringBuilder, Int64Builder},
19    datatypes::SchemaRef,
20    error::ArrowError,
21};
22use exon_common::ExonArrayBuilder;
23use noodles::vcf::{
24    variant::record::{AlternateBases, Filters, Ids},
25    Header,
26};
27
28use noodles::vcf::variant::Record as VCFRecord;
29
30use super::{GenotypeBuilder, InfosBuilder};
31
32/// A builder for creating a `ArrayRef` from a `VCF` file.
33pub struct VCFArrayBuilder {
34    chromosomes: GenericStringBuilder<i32>,
35    positions: Int64Builder,
36    ids: GenericListBuilder<i32, GenericStringBuilder<i32>>,
37    references: GenericStringBuilder<i32>,
38    alternates: GenericListBuilder<i32, GenericStringBuilder<i32>>,
39    qualities: Float32Builder,
40    filters: GenericListBuilder<i32, GenericStringBuilder<i32>>,
41
42    infos: InfosBuilder,
43    formats: GenotypeBuilder,
44
45    header: Arc<Header>,
46
47    projection: Vec<usize>,
48
49    n_rows: usize,
50}
51
52impl VCFArrayBuilder {
53    /// Creates a new `VCFArrayBuilder` from a `Schema`.
54    pub fn create(
55        schema: SchemaRef,
56        capacity: usize,
57        projection: Option<Vec<usize>>,
58        header: Arc<Header>,
59    ) -> Result<Self, ArrowError> {
60        let info_field = schema.field_with_name("info")?;
61        let format_field = schema.field_with_name("formats")?;
62
63        let projection = match projection {
64            Some(projection) => projection.to_vec(),
65            None => (0..schema.fields().len()).collect(),
66        };
67
68        Ok(Self {
69            n_rows: 0,
70            chromosomes: GenericStringBuilder::<i32>::new(),
71            positions: Int64Builder::new(),
72            ids: GenericListBuilder::<i32, GenericStringBuilder<i32>>::new(GenericStringBuilder::<
73                i32,
74            >::new()),
75            references: GenericStringBuilder::<i32>::new(),
76            alternates: GenericListBuilder::<i32, GenericStringBuilder<i32>>::new(
77                GenericStringBuilder::<i32>::new(),
78            ),
79            qualities: Float32Builder::new(),
80            filters: GenericListBuilder::<i32, GenericStringBuilder<i32>>::new(
81                GenericStringBuilder::<i32>::new(),
82            ),
83
84            infos: InfosBuilder::try_new(info_field, header.clone(), capacity)?,
85
86            formats: GenotypeBuilder::try_new(format_field, capacity)?,
87            header,
88
89            projection,
90        })
91    }
92
93    /// Appends a record to the builder.
94    pub fn append<T>(&mut self, record: T) -> Result<(), ArrowError>
95    where
96        T: VCFRecord,
97    {
98        for col_idx in self.projection.iter() {
99            match col_idx {
100                0 => {
101                    let chromosome = record.reference_sequence_name(&self.header)?.to_string();
102                    self.chromosomes.append_value(chromosome);
103                }
104                1 => {
105                    if let Some(position) = record.variant_start() {
106                        let position = position?;
107                        self.positions.append_value(position.get() as i64);
108                    } else {
109                        self.positions.append_null();
110                    }
111                }
112                2 => {
113                    for id in record.ids().iter() {
114                        self.ids.values().append_value(id);
115                    }
116
117                    self.ids.append(true);
118                }
119                3 => {
120                    let mut s = String::new();
121                    for base in record.reference_bases().iter() {
122                        let base = base?.into();
123                        s.push(base);
124                    }
125                    self.references.append_value(s);
126                }
127                4 => {
128                    for alt in record.alternate_bases().iter() {
129                        let alt = alt?;
130                        self.alternates.values().append_value(alt);
131                    }
132
133                    self.alternates.append(true);
134                }
135                5 => {
136                    let quality_score = record.quality_score().transpose()?;
137                    self.qualities.append_option(quality_score);
138                }
139                6 => {
140                    let filters = record.filters();
141
142                    for filter in filters.iter(&self.header) {
143                        let filter = filter?;
144                        self.filters.values().append_value(filter);
145                    }
146
147                    self.filters.append(true);
148                }
149                7 => {
150                    let info = record.info();
151                    self.infos.append_value(info)?;
152                }
153                8 => {
154                    let samples = record.samples()?;
155                    self.formats.append_value(samples, &self.header)?;
156                }
157                _ => Err(ArrowError::InvalidArgumentError(
158                    "Invalid column index".to_string(),
159                ))?,
160            }
161        }
162
163        self.n_rows += 1;
164
165        Ok(())
166    }
167}
168
169impl ExonArrayBuilder for VCFArrayBuilder {
170    fn finish(&mut self) -> Vec<ArrayRef> {
171        let mut arrays: Vec<ArrayRef> = vec![];
172
173        for col_idx in self.projection.iter() {
174            match col_idx {
175                0 => arrays.push(Arc::new(self.chromosomes.finish())),
176                1 => arrays.push(Arc::new(self.positions.finish())),
177                2 => arrays.push(Arc::new(self.ids.finish())),
178                3 => arrays.push(Arc::new(self.references.finish())),
179                4 => arrays.push(Arc::new(self.alternates.finish())),
180                5 => arrays.push(Arc::new(self.qualities.finish())),
181                6 => arrays.push(Arc::new(self.filters.finish())),
182                7 => arrays.push(Arc::new(self.infos.finish())),
183                8 => arrays.push(Arc::new(self.formats.finish())),
184                _ => panic!("Not implemented"),
185            }
186        }
187
188        arrays
189    }
190
191    fn len(&self) -> usize {
192        self.n_rows
193    }
194}