exon_sam/
array_builder.rs

1// Copyright 2023 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::{
18    array::{ArrayRef, GenericListBuilder, GenericStringBuilder, Int32Builder, Int64Builder},
19    error::ArrowError,
20    error::Result,
21};
22use exon_common::ExonArrayBuilder;
23use noodles::sam::alignment::{
24    record::{cigar::op::Kind, Cigar},
25    RecordBuf,
26};
27use noodles::sam::Header;
28
29use crate::{SAMConfig, TagsBuilder};
30
31/// Builds an vector of arrays from a SAM file.
32pub struct SAMArrayBuilder {
33    names: GenericStringBuilder<i32>,
34    flags: Int32Builder,
35    references: GenericStringBuilder<i32>,
36    starts: Int64Builder,
37    ends: Int64Builder,
38    mapping_qualities: GenericStringBuilder<i32>,
39    cigar: GenericStringBuilder<i32>,
40    mate_references: GenericStringBuilder<i32>,
41    sequences: GenericStringBuilder<i32>,
42    quality_scores: GenericListBuilder<i32, Int64Builder>,
43
44    tags: TagsBuilder,
45
46    projection: Vec<usize>,
47
48    rows: usize,
49
50    header: Header,
51}
52
53impl SAMArrayBuilder {
54    /// Creates a new SAM array builder.
55    pub fn create(header: Header, sam_config: Arc<SAMConfig>) -> Self {
56        let tags_builder = sam_config
57            .file_schema
58            .field_with_name("tags")
59            .map_or(TagsBuilder::default(), |field| {
60                TagsBuilder::try_from(field.data_type()).unwrap()
61            });
62
63        let projection = sam_config.projection();
64
65        let quality_scores = GenericListBuilder::<i32, Int64Builder>::new(Int64Builder::new());
66
67        Self {
68            names: GenericStringBuilder::<i32>::new(),
69            flags: Int32Builder::new(),
70            references: GenericStringBuilder::<i32>::new(),
71            starts: Int64Builder::new(),
72            ends: Int64Builder::new(),
73            mapping_qualities: GenericStringBuilder::<i32>::new(),
74            cigar: GenericStringBuilder::<i32>::new(),
75            mate_references: GenericStringBuilder::<i32>::new(),
76            sequences: GenericStringBuilder::<i32>::new(),
77            quality_scores,
78
79            tags: tags_builder,
80
81            projection,
82
83            rows: 0,
84
85            header,
86        }
87    }
88
89    /// Returns the number of records in the builder.
90    pub fn len(&self) -> usize {
91        self.rows
92    }
93
94    /// Returns whether the builder is empty.
95    pub fn is_empty(&self) -> bool {
96        self.len() == 0
97    }
98
99    /// Appends a record to the builder.
100    pub fn append(&mut self, record: &RecordBuf) -> Result<()> {
101        for col_idx in self.projection.iter() {
102            match col_idx {
103                0 => {
104                    if let Some(name) = record.name() {
105                        let name = std::str::from_utf8(name.as_ref())?;
106                        self.names.append_value(name);
107                    } else {
108                        self.names.append_null();
109                    }
110                }
111                1 => {
112                    let flag_bits = record.flags().bits();
113                    self.flags.append_value(flag_bits as i32);
114                }
115                2 => {
116                    let reference_name = match record.reference_sequence(&self.header) {
117                        Some(Ok((name, _))) => Some(std::str::from_utf8(name)?),
118                        Some(Err(_)) => None,
119                        None => None,
120                    };
121                    self.references.append_option(reference_name);
122                }
123                3 => {
124                    self.starts
125                        .append_option(record.alignment_start().map(|v| v.get() as i64));
126                }
127                4 => {
128                    self.ends
129                        .append_option(record.alignment_end().map(|v| v.get() as i64));
130                }
131                5 => {
132                    self.mapping_qualities
133                        .append_option(record.mapping_quality().map(|v| v.get().to_string()));
134                }
135                6 => {
136                    let mut cigar_to_print = Vec::new();
137
138                    // let cigar_string = cigar.iter().map(|c| c.to_string()).join("");
139                    for op_result in record.cigar().iter() {
140                        let op = op_result?;
141
142                        let kind_str = match op.kind() {
143                            Kind::Deletion => "D",
144                            Kind::Insertion => "I",
145                            Kind::HardClip => "H",
146                            Kind::SoftClip => "S",
147                            Kind::Match => "M",
148                            Kind::SequenceMismatch => "X",
149                            Kind::Skip => "N",
150                            Kind::Pad => "P",
151                            Kind::SequenceMatch => "=",
152                        };
153
154                        cigar_to_print.push(format!("{}{}", op.len(), kind_str));
155                    }
156
157                    let cigar_string = cigar_to_print.join("");
158                    self.cigar.append_value(cigar_string);
159                }
160                7 => {
161                    let mate_reference_name = match record.mate_reference_sequence(&self.header) {
162                        Some(Ok((name, _))) => Some(std::str::from_utf8(name)?),
163                        Some(Err(_)) => None,
164                        None => None,
165                    };
166                    self.mate_references.append_option(mate_reference_name);
167                }
168                8 => {
169                    let sequence = record.sequence().as_ref();
170                    self.sequences.append_value(std::str::from_utf8(sequence)?);
171                }
172                9 => {
173                    let quality_scores = record.quality_scores().as_ref();
174                    let slice_i8: &[i8] = unsafe {
175                        std::slice::from_raw_parts(
176                            quality_scores.as_ptr() as *const i8,
177                            quality_scores.len(),
178                        )
179                    };
180
181                    let slice_i64 = slice_i8.iter().map(|v| *v as i64).collect::<Vec<_>>();
182
183                    self.quality_scores.values().append_slice(&slice_i64);
184                    self.quality_scores.append(true);
185                }
186                10 => {
187                    // This is _very_ similar to BAM, may not need body any more
188                    let data = record.data();
189                    self.tags.append(data)?;
190                }
191                _ => {
192                    return Err(ArrowError::InvalidArgumentError(format!(
193                        "Invalid column index {} for SAM",
194                        col_idx
195                    )))
196                }
197            }
198        }
199
200        self.rows += 1;
201
202        Ok(())
203    }
204
205    /// Finishes the builder and returns an vector of arrays.
206    pub fn finish(&mut self) -> Vec<ArrayRef> {
207        let mut arrays: Vec<ArrayRef> = Vec::new();
208
209        for col_idx in self.projection.iter() {
210            match col_idx {
211                0 => arrays.push(Arc::new(self.names.finish())),
212                1 => arrays.push(Arc::new(self.flags.finish())),
213                2 => arrays.push(Arc::new(self.references.finish())),
214                3 => arrays.push(Arc::new(self.starts.finish())),
215                4 => arrays.push(Arc::new(self.ends.finish())),
216                5 => arrays.push(Arc::new(self.mapping_qualities.finish())),
217                6 => arrays.push(Arc::new(self.cigar.finish())),
218                7 => arrays.push(Arc::new(self.mate_references.finish())),
219                8 => arrays.push(Arc::new(self.sequences.finish())),
220                9 => arrays.push(Arc::new(self.quality_scores.finish())),
221                10 => arrays.push(Arc::new(self.tags.finish())),
222                _ => panic!("Invalid column index {} for SAM", col_idx),
223            }
224        }
225
226        arrays
227    }
228}
229
230impl ExonArrayBuilder for SAMArrayBuilder {
231    /// Finishes building the internal data structures and returns the built arrays.
232    fn finish(&mut self) -> Vec<ArrayRef> {
233        self.finish()
234    }
235
236    /// Returns the number of elements in the array.
237    fn len(&self) -> usize {
238        self.len()
239    }
240}