exon_sam/
array_builder.rs1use std::sync::Arc;
16
17use arrow::{
18 array::{ArrayRef, GenericListBuilder, GenericStringBuilder, Int32Builder, Int64Builder},
19 error::ArrowError,
20 error::Result,
21};
22use exon_common::ExonArrayBuilder;
23use noodles::sam::alignment::{
24 record::{cigar::op::Kind, Cigar},
25 RecordBuf,
26};
27use noodles::sam::Header;
28
29use crate::{SAMConfig, TagsBuilder};
30
31pub struct SAMArrayBuilder {
33 names: GenericStringBuilder<i32>,
34 flags: Int32Builder,
35 references: GenericStringBuilder<i32>,
36 starts: Int64Builder,
37 ends: Int64Builder,
38 mapping_qualities: GenericStringBuilder<i32>,
39 cigar: GenericStringBuilder<i32>,
40 mate_references: GenericStringBuilder<i32>,
41 sequences: GenericStringBuilder<i32>,
42 quality_scores: GenericListBuilder<i32, Int64Builder>,
43
44 tags: TagsBuilder,
45
46 projection: Vec<usize>,
47
48 rows: usize,
49
50 header: Header,
51}
52
53impl SAMArrayBuilder {
54 pub fn create(header: Header, sam_config: Arc<SAMConfig>) -> Self {
56 let tags_builder = sam_config
57 .file_schema
58 .field_with_name("tags")
59 .map_or(TagsBuilder::default(), |field| {
60 TagsBuilder::try_from(field.data_type()).unwrap()
61 });
62
63 let projection = sam_config.projection();
64
65 let quality_scores = GenericListBuilder::<i32, Int64Builder>::new(Int64Builder::new());
66
67 Self {
68 names: GenericStringBuilder::<i32>::new(),
69 flags: Int32Builder::new(),
70 references: GenericStringBuilder::<i32>::new(),
71 starts: Int64Builder::new(),
72 ends: Int64Builder::new(),
73 mapping_qualities: GenericStringBuilder::<i32>::new(),
74 cigar: GenericStringBuilder::<i32>::new(),
75 mate_references: GenericStringBuilder::<i32>::new(),
76 sequences: GenericStringBuilder::<i32>::new(),
77 quality_scores,
78
79 tags: tags_builder,
80
81 projection,
82
83 rows: 0,
84
85 header,
86 }
87 }
88
89 pub fn len(&self) -> usize {
91 self.rows
92 }
93
94 pub fn is_empty(&self) -> bool {
96 self.len() == 0
97 }
98
99 pub fn append(&mut self, record: &RecordBuf) -> Result<()> {
101 for col_idx in self.projection.iter() {
102 match col_idx {
103 0 => {
104 if let Some(name) = record.name() {
105 let name = std::str::from_utf8(name.as_ref())?;
106 self.names.append_value(name);
107 } else {
108 self.names.append_null();
109 }
110 }
111 1 => {
112 let flag_bits = record.flags().bits();
113 self.flags.append_value(flag_bits as i32);
114 }
115 2 => {
116 let reference_name = match record.reference_sequence(&self.header) {
117 Some(Ok((name, _))) => Some(std::str::from_utf8(name)?),
118 Some(Err(_)) => None,
119 None => None,
120 };
121 self.references.append_option(reference_name);
122 }
123 3 => {
124 self.starts
125 .append_option(record.alignment_start().map(|v| v.get() as i64));
126 }
127 4 => {
128 self.ends
129 .append_option(record.alignment_end().map(|v| v.get() as i64));
130 }
131 5 => {
132 self.mapping_qualities
133 .append_option(record.mapping_quality().map(|v| v.get().to_string()));
134 }
135 6 => {
136 let mut cigar_to_print = Vec::new();
137
138 for op_result in record.cigar().iter() {
140 let op = op_result?;
141
142 let kind_str = match op.kind() {
143 Kind::Deletion => "D",
144 Kind::Insertion => "I",
145 Kind::HardClip => "H",
146 Kind::SoftClip => "S",
147 Kind::Match => "M",
148 Kind::SequenceMismatch => "X",
149 Kind::Skip => "N",
150 Kind::Pad => "P",
151 Kind::SequenceMatch => "=",
152 };
153
154 cigar_to_print.push(format!("{}{}", op.len(), kind_str));
155 }
156
157 let cigar_string = cigar_to_print.join("");
158 self.cigar.append_value(cigar_string);
159 }
160 7 => {
161 let mate_reference_name = match record.mate_reference_sequence(&self.header) {
162 Some(Ok((name, _))) => Some(std::str::from_utf8(name)?),
163 Some(Err(_)) => None,
164 None => None,
165 };
166 self.mate_references.append_option(mate_reference_name);
167 }
168 8 => {
169 let sequence = record.sequence().as_ref();
170 self.sequences.append_value(std::str::from_utf8(sequence)?);
171 }
172 9 => {
173 let quality_scores = record.quality_scores().as_ref();
174 let slice_i8: &[i8] = unsafe {
175 std::slice::from_raw_parts(
176 quality_scores.as_ptr() as *const i8,
177 quality_scores.len(),
178 )
179 };
180
181 let slice_i64 = slice_i8.iter().map(|v| *v as i64).collect::<Vec<_>>();
182
183 self.quality_scores.values().append_slice(&slice_i64);
184 self.quality_scores.append(true);
185 }
186 10 => {
187 let data = record.data();
189 self.tags.append(data)?;
190 }
191 _ => {
192 return Err(ArrowError::InvalidArgumentError(format!(
193 "Invalid column index {} for SAM",
194 col_idx
195 )))
196 }
197 }
198 }
199
200 self.rows += 1;
201
202 Ok(())
203 }
204
205 pub fn finish(&mut self) -> Vec<ArrayRef> {
207 let mut arrays: Vec<ArrayRef> = Vec::new();
208
209 for col_idx in self.projection.iter() {
210 match col_idx {
211 0 => arrays.push(Arc::new(self.names.finish())),
212 1 => arrays.push(Arc::new(self.flags.finish())),
213 2 => arrays.push(Arc::new(self.references.finish())),
214 3 => arrays.push(Arc::new(self.starts.finish())),
215 4 => arrays.push(Arc::new(self.ends.finish())),
216 5 => arrays.push(Arc::new(self.mapping_qualities.finish())),
217 6 => arrays.push(Arc::new(self.cigar.finish())),
218 7 => arrays.push(Arc::new(self.mate_references.finish())),
219 8 => arrays.push(Arc::new(self.sequences.finish())),
220 9 => arrays.push(Arc::new(self.quality_scores.finish())),
221 10 => arrays.push(Arc::new(self.tags.finish())),
222 _ => panic!("Invalid column index {} for SAM", col_idx),
223 }
224 }
225
226 arrays
227 }
228}
229
230impl ExonArrayBuilder for SAMArrayBuilder {
231 fn finish(&mut self) -> Vec<ArrayRef> {
233 self.finish()
234 }
235
236 fn len(&self) -> usize {
238 self.len()
239 }
240}