exon_gff/
array_builder.rs1use std::sync::Arc;
16
17use arrow::{
18 array::{
19 ArrayRef, Float32Builder, GenericListBuilder, GenericStringBuilder, Int64Builder,
20 MapBuilder,
21 },
22 datatypes::SchemaRef,
23 error::ArrowError,
24};
25use exon_common::ExonArrayBuilder;
26use noodles::gff::Record;
27
28pub struct GFFArrayBuilder {
29 seqnames: GenericStringBuilder<i32>,
30 sources: GenericStringBuilder<i32>,
31 feature_types: GenericStringBuilder<i32>,
32 starts: Int64Builder,
33 ends: Int64Builder,
34 scores: Float32Builder,
35 strands: GenericStringBuilder<i32>,
36 phases: GenericStringBuilder<i32>,
37 attributes:
38 MapBuilder<GenericStringBuilder<i32>, GenericListBuilder<i32, GenericStringBuilder<i32>>>,
39
40 projection: Vec<usize>,
41 rows: usize,
42}
43
44impl GFFArrayBuilder {
45 pub fn new(schema: SchemaRef, projection: Option<Vec<usize>>) -> Self {
46 let projection = match projection {
47 Some(projection) => projection,
48 None => (0..schema.fields().len()).collect(),
49 };
50
51 Self {
52 seqnames: GenericStringBuilder::<i32>::new(),
53 sources: GenericStringBuilder::<i32>::new(),
54 feature_types: GenericStringBuilder::<i32>::new(),
55 starts: Int64Builder::new(),
56 ends: Int64Builder::new(),
57 scores: Float32Builder::new(),
58 strands: GenericStringBuilder::<i32>::new(),
59 phases: GenericStringBuilder::<i32>::new(),
60 attributes: MapBuilder::new(
61 None,
62 GenericStringBuilder::<i32>::new(),
63 GenericListBuilder::<i32, GenericStringBuilder<i32>>::new(GenericStringBuilder::<
64 i32,
65 >::new()),
66 ),
67 rows: 0,
68 projection,
69 }
70 }
71
72 pub fn len(&self) -> usize {
74 self.rows
75 }
76
77 pub fn is_empty(&self) -> bool {
79 self.len() == 0
80 }
81
82 pub fn append(&mut self, record: &Record) -> Result<(), ArrowError> {
83 for col_idx in self.projection.iter() {
84 match col_idx {
85 0 => self.seqnames.append_value(record.reference_sequence_name()),
86 1 => self.sources.append_value(record.source()),
87 2 => self.feature_types.append_value(record.ty()),
88 3 => {
89 let start_pos = record.start()?;
90 self.starts.append_value(start_pos.get() as i64)
91 }
92 4 => {
93 let end_pos = record.end()?;
94 self.ends.append_value(end_pos.get() as i64)
95 }
96 5 => {
97 let score = record.score();
98
99 match score {
100 Some(Ok(score)) => {
101 self.scores.append_value(score);
102 }
103 Some(Err(e)) => return Err(ArrowError::ExternalError(Box::new(e))),
104 None => self.scores.append_null(),
105 }
106 }
107 6 => {
108 let strand = record.strand()?;
109
110 if strand.as_ref() == "" || strand.as_ref() == "." {
111 self.strands.append_null();
112 } else {
113 self.strands.append_value(strand);
114 }
115 }
116 7 => {
117 let phase = record.phase();
118
119 match phase {
120 Some(Ok(phase)) => {
121 self.phases.append_value(phase);
122 }
123 Some(Err(e)) => return Err(ArrowError::ExternalError(Box::new(e))),
124 None => self.phases.append_null(),
125 }
126 }
127 8 => {
128 for resp in record.attributes().iter() {
129 let (key, value) = resp?;
130
131 self.attributes.keys().append_value(key);
132
133 match value {
134 noodles::gff::record::attributes::field::Value::String(value) => {
135 self.attributes.values().append(true);
136 self.attributes.values().values().append_value(value);
137 }
138 noodles::gff::record::attributes::field::Value::Array(attr_values) => {
139 let list_values = self.attributes.values().values();
140 for value in attr_values.iter() {
141 let value = value?;
142
143 list_values.append_value(value);
144 }
145 self.attributes.values().append(true);
146 }
147 }
148 }
149
150 self.attributes.append(true)?;
151 }
152 _ => {
153 return Err(ArrowError::ExternalError(
154 "Unexpected number of columns in projections".into(),
155 ))
156 }
157 }
158 }
159
160 self.rows += 1;
161 Ok(())
162 }
163
164 pub fn finish(&mut self) -> Vec<ArrayRef> {
165 let mut arrays: Vec<ArrayRef> = Vec::with_capacity(self.projection.len());
166
167 for col_idx in self.projection.iter() {
168 match col_idx {
169 0 => arrays.push(Arc::new(self.seqnames.finish())),
170 1 => arrays.push(Arc::new(self.sources.finish())),
171 2 => arrays.push(Arc::new(self.feature_types.finish())),
172 3 => arrays.push(Arc::new(self.starts.finish())),
173 4 => arrays.push(Arc::new(self.ends.finish())),
174 5 => arrays.push(Arc::new(self.scores.finish())),
175 6 => arrays.push(Arc::new(self.strands.finish())),
176 7 => arrays.push(Arc::new(self.phases.finish())),
177 8 => arrays.push(Arc::new(self.attributes.finish())),
178 _ => panic!("Invalid col_idx for GFF ({})", col_idx),
179 }
180 }
181
182 arrays
183 }
184}
185
186impl ExonArrayBuilder for GFFArrayBuilder {
187 fn finish(&mut self) -> Vec<ArrayRef> {
189 self.finish()
190 }
191
192 fn len(&self) -> usize {
194 self.rows
195 }
196}