1use std::io::{BufWriter, Write};
27use std::num::NonZero;
28use std::path::Path;
29
30use rsomics_bamio::raw::{self, RawRecord};
31use rsomics_common::{Result, RsomicsError};
32
33const OP_MATCH: u8 = 0; const OP_DEL: u8 = 2; const OP_SKIP: u8 = 3; const OP_EQ: u8 = 7; const OP_DIFF: u8 = 8; #[inline]
42fn ref_len(op: u8, len: u32) -> u64 {
43 match op {
44 OP_MATCH | OP_DEL | OP_SKIP | OP_EQ | OP_DIFF => u64::from(len),
45 _ => 0,
46 }
47}
48
49#[derive(Default)]
50pub struct BamToBedOpts {
51 pub split: bool,
53 pub use_edit_distance: bool,
55 pub cigar: bool,
57}
58
59struct Bed6<'a> {
61 chrom: &'a str,
62 start: u64,
63 end: u64,
64 name: &'a [u8],
65 score: u32,
66 strand: u8,
67 cigar: Option<&'a str>,
68}
69
70impl Bed6<'_> {
71 fn write(&self, out: &mut impl Write) -> Result<()> {
72 let mut ib = itoa::Buffer::new();
73 out.write_all(self.chrom.as_bytes())
74 .map_err(RsomicsError::Io)?;
75 out.write_all(b"\t").map_err(RsomicsError::Io)?;
76 out.write_all(ib.format(self.start).as_bytes())
77 .map_err(RsomicsError::Io)?;
78 out.write_all(b"\t").map_err(RsomicsError::Io)?;
79 out.write_all(ib.format(self.end).as_bytes())
80 .map_err(RsomicsError::Io)?;
81 out.write_all(b"\t").map_err(RsomicsError::Io)?;
82 out.write_all(self.name).map_err(RsomicsError::Io)?;
83 out.write_all(b"\t").map_err(RsomicsError::Io)?;
84 out.write_all(ib.format(self.score).as_bytes())
85 .map_err(RsomicsError::Io)?;
86 out.write_all(b"\t").map_err(RsomicsError::Io)?;
87 out.write_all(&[self.strand]).map_err(RsomicsError::Io)?;
88 if let Some(c) = self.cigar {
89 out.write_all(b"\t").map_err(RsomicsError::Io)?;
90 out.write_all(c.as_bytes()).map_err(RsomicsError::Io)?;
91 }
92 out.write_all(b"\n").map_err(RsomicsError::Io)?;
93 Ok(())
94 }
95}
96
97pub fn bam_to_bed(
98 input: &Path,
99 output: &mut dyn Write,
100 opts: &BamToBedOpts,
101 workers: NonZero<usize>,
102) -> Result<u64> {
103 let mut reader = rsomics_bamio::open_with_workers(input, workers)?;
104 let header = reader.read_header().map_err(RsomicsError::Io)?;
105 let ref_names: Vec<String> = header
106 .reference_sequences()
107 .keys()
108 .map(ToString::to_string)
109 .collect();
110
111 let mut out = BufWriter::with_capacity(256 * 1024, output);
112 let mut record = RawRecord::default();
113 let mut count: u64 = 0;
114
115 while raw::read_record(reader.get_mut(), &mut record)? != 0 {
116 let flags = record.flags();
117
118 if flags & 0x4 != 0 {
120 continue;
121 }
122
123 let tid = record.reference_sequence_id();
124 let pos0 = record.alignment_start(); if tid < 0 || pos0 < 0 {
126 continue;
127 }
128
129 let chrom = usize::try_from(tid)
130 .ok()
131 .and_then(|i| ref_names.get(i))
132 .map_or("*", String::as_str);
133 let name = build_name(record.name(), flags);
134 let strand = if flags & 0x10 != 0 { b'-' } else { b'+' };
135 let score: u32 = if opts.use_edit_distance {
136 nm_tag(&record)
137 } else {
138 u32::from(record.mapping_quality())
139 };
140 let cigar_str: Option<String> = if opts.cigar {
141 Some(cigar_to_string(record.cigar_ops()))
142 } else {
143 None
144 };
145
146 if opts.split {
147 let ctx = SplitContext {
148 chrom,
149 name: &name,
150 score,
151 strand,
152 cigar_str: cigar_str.as_deref(),
153 };
154 count += emit_split_blocks(&record, &ctx, u64::try_from(pos0).unwrap_or(0), &mut out)?;
155 } else {
156 let ref_span: u64 = record.cigar_ops().map(|(op, len)| ref_len(op, len)).sum();
157 let start = u64::try_from(pos0).unwrap_or(0);
158 let end = start + ref_span;
159 Bed6 {
160 chrom,
161 start,
162 end,
163 name: &name,
164 score,
165 strand,
166 cigar: cigar_str.as_deref(),
167 }
168 .write(&mut out)?;
169 count += 1;
170 }
171 }
172
173 out.flush().map_err(RsomicsError::Io)?;
174 Ok(count)
175}
176
177struct SplitContext<'a> {
178 chrom: &'a str,
179 name: &'a [u8],
180 score: u32,
181 strand: u8,
182 cigar_str: Option<&'a str>,
183}
184
185fn emit_split_blocks(
187 record: &RawRecord,
188 ctx: &SplitContext<'_>,
189 pos0: u64,
190 out: &mut impl Write,
191) -> Result<u64> {
192 let SplitContext {
193 chrom,
194 name,
195 score,
196 strand,
197 cigar_str,
198 } = ctx;
199 let mut ref_cursor = pos0;
200 let mut block_start = pos0;
201 let mut in_block = false;
202 let mut count: u64 = 0;
203
204 for (op, len) in record.cigar_ops() {
205 let rlen = u64::from(len);
206 match op {
207 OP_SKIP => {
208 if in_block {
210 Bed6 {
211 chrom,
212 start: block_start,
213 end: ref_cursor,
214 name,
215 score: *score,
216 strand: *strand,
217 cigar: None,
218 }
219 .write(out)?;
220 count += 1;
221 in_block = false;
222 }
223 ref_cursor += rlen;
224 block_start = ref_cursor;
225 }
226 OP_MATCH | OP_DEL | OP_EQ | OP_DIFF => {
227 if !in_block {
228 block_start = ref_cursor;
229 in_block = true;
230 }
231 ref_cursor += rlen;
232 }
233 _ => {}
234 }
235 }
236
237 if in_block {
239 Bed6 {
240 chrom,
241 start: block_start,
242 end: ref_cursor,
243 name,
244 score: *score,
245 strand: *strand,
246 cigar: *cigar_str,
247 }
248 .write(out)?;
249 count += 1;
250 }
251
252 Ok(count)
253}
254
255fn build_name(raw: &[u8], flags: u16) -> Vec<u8> {
261 let is_paired = flags & 0x1 != 0;
262 if !is_paired {
263 return raw.to_vec();
264 }
265 let suffix: &[u8] = if flags & 0x40 != 0 { b"/1" } else { b"/2" };
266 let mut name = Vec::with_capacity(raw.len() + 2);
267 name.extend_from_slice(raw);
268 name.extend_from_slice(suffix);
269 name
270}
271
272fn nm_tag(record: &RawRecord) -> u32 {
274 record
275 .aux_value(*b"NM")
276 .and_then(|val| {
277 let type_code = record.aux_type(*b"NM")?;
278 parse_int_aux(type_code, val)
279 })
280 .unwrap_or(0)
281}
282
283fn parse_int_aux(type_code: u8, val: &[u8]) -> Option<u32> {
284 match type_code {
285 b'c' => val
287 .first()
288 .map(|&v| i32::from(v.cast_signed()).cast_unsigned()),
289 b'C' => val.first().copied().map(u32::from),
290 b's' => val
291 .get(..2)
292 .map(|b| i32::from(i16::from_le_bytes([b[0], b[1]])).cast_unsigned()),
293 b'S' => val
294 .get(..2)
295 .map(|b| u32::from(u16::from_le_bytes([b[0], b[1]]))),
296 b'i' => val
297 .get(..4)
298 .map(|b| i32::from_le_bytes([b[0], b[1], b[2], b[3]]).cast_unsigned()),
299 b'I' => val
300 .get(..4)
301 .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]])),
302 _ => None,
303 }
304}
305
306const CIGAR_CHARS: &[u8] = b"MIDNSHP=X";
307
308fn cigar_to_string(ops: impl Iterator<Item = (u8, u32)>) -> String {
309 let mut s = String::new();
310 let mut ib = itoa::Buffer::new();
311 for (op, len) in ops {
312 s.push_str(ib.format(len));
313 s.push(CIGAR_CHARS.get(op as usize).copied().unwrap_or(b'?') as char);
314 }
315 s
316}