exon_vcf/array_builder/
eager_array_builder.rs1use std::sync::Arc;
16
17use arrow::{
18 array::{ArrayRef, Float32Builder, GenericListBuilder, GenericStringBuilder, Int64Builder},
19 datatypes::SchemaRef,
20 error::ArrowError,
21};
22use exon_common::ExonArrayBuilder;
23use noodles::vcf::{
24 variant::record::{AlternateBases, Filters, Ids},
25 Header,
26};
27
28use noodles::vcf::variant::Record as VCFRecord;
29
30use super::{GenotypeBuilder, InfosBuilder};
31
32pub struct VCFArrayBuilder {
34 chromosomes: GenericStringBuilder<i32>,
35 positions: Int64Builder,
36 ids: GenericListBuilder<i32, GenericStringBuilder<i32>>,
37 references: GenericStringBuilder<i32>,
38 alternates: GenericListBuilder<i32, GenericStringBuilder<i32>>,
39 qualities: Float32Builder,
40 filters: GenericListBuilder<i32, GenericStringBuilder<i32>>,
41
42 infos: InfosBuilder,
43 formats: GenotypeBuilder,
44
45 header: Arc<Header>,
46
47 projection: Vec<usize>,
48
49 n_rows: usize,
50}
51
52impl VCFArrayBuilder {
53 pub fn create(
55 schema: SchemaRef,
56 capacity: usize,
57 projection: Option<Vec<usize>>,
58 header: Arc<Header>,
59 ) -> Result<Self, ArrowError> {
60 let info_field = schema.field_with_name("info")?;
61 let format_field = schema.field_with_name("formats")?;
62
63 let projection = match projection {
64 Some(projection) => projection.to_vec(),
65 None => (0..schema.fields().len()).collect(),
66 };
67
68 Ok(Self {
69 n_rows: 0,
70 chromosomes: GenericStringBuilder::<i32>::new(),
71 positions: Int64Builder::new(),
72 ids: GenericListBuilder::<i32, GenericStringBuilder<i32>>::new(GenericStringBuilder::<
73 i32,
74 >::new()),
75 references: GenericStringBuilder::<i32>::new(),
76 alternates: GenericListBuilder::<i32, GenericStringBuilder<i32>>::new(
77 GenericStringBuilder::<i32>::new(),
78 ),
79 qualities: Float32Builder::new(),
80 filters: GenericListBuilder::<i32, GenericStringBuilder<i32>>::new(
81 GenericStringBuilder::<i32>::new(),
82 ),
83
84 infos: InfosBuilder::try_new(info_field, header.clone(), capacity)?,
85
86 formats: GenotypeBuilder::try_new(format_field, capacity)?,
87 header,
88
89 projection,
90 })
91 }
92
93 pub fn append<T>(&mut self, record: T) -> Result<(), ArrowError>
95 where
96 T: VCFRecord,
97 {
98 for col_idx in self.projection.iter() {
99 match col_idx {
100 0 => {
101 let chromosome = record.reference_sequence_name(&self.header)?.to_string();
102 self.chromosomes.append_value(chromosome);
103 }
104 1 => {
105 if let Some(position) = record.variant_start() {
106 let position = position?;
107 self.positions.append_value(position.get() as i64);
108 } else {
109 self.positions.append_null();
110 }
111 }
112 2 => {
113 for id in record.ids().iter() {
114 self.ids.values().append_value(id);
115 }
116
117 self.ids.append(true);
118 }
119 3 => {
120 let mut s = String::new();
121 for base in record.reference_bases().iter() {
122 let base = base?.into();
123 s.push(base);
124 }
125 self.references.append_value(s);
126 }
127 4 => {
128 for alt in record.alternate_bases().iter() {
129 let alt = alt?;
130 self.alternates.values().append_value(alt);
131 }
132
133 self.alternates.append(true);
134 }
135 5 => {
136 let quality_score = record.quality_score().transpose()?;
137 self.qualities.append_option(quality_score);
138 }
139 6 => {
140 let filters = record.filters();
141
142 for filter in filters.iter(&self.header) {
143 let filter = filter?;
144 self.filters.values().append_value(filter);
145 }
146
147 self.filters.append(true);
148 }
149 7 => {
150 let info = record.info();
151 self.infos.append_value(info)?;
152 }
153 8 => {
154 let samples = record.samples()?;
155 self.formats.append_value(samples, &self.header)?;
156 }
157 _ => Err(ArrowError::InvalidArgumentError(
158 "Invalid column index".to_string(),
159 ))?,
160 }
161 }
162
163 self.n_rows += 1;
164
165 Ok(())
166 }
167}
168
169impl ExonArrayBuilder for VCFArrayBuilder {
170 fn finish(&mut self) -> Vec<ArrayRef> {
171 let mut arrays: Vec<ArrayRef> = vec![];
172
173 for col_idx in self.projection.iter() {
174 match col_idx {
175 0 => arrays.push(Arc::new(self.chromosomes.finish())),
176 1 => arrays.push(Arc::new(self.positions.finish())),
177 2 => arrays.push(Arc::new(self.ids.finish())),
178 3 => arrays.push(Arc::new(self.references.finish())),
179 4 => arrays.push(Arc::new(self.alternates.finish())),
180 5 => arrays.push(Arc::new(self.qualities.finish())),
181 6 => arrays.push(Arc::new(self.filters.finish())),
182 7 => arrays.push(Arc::new(self.infos.finish())),
183 8 => arrays.push(Arc::new(self.formats.finish())),
184 _ => panic!("Not implemented"),
185 }
186 }
187
188 arrays
189 }
190
191 fn len(&self) -> usize {
192 self.n_rows
193 }
194}