Skip to main content

rsomics_bam_to_bed/
lib.rs

1//! Convert BAM alignments to BED6 format.
2//!
3//! ## Algorithm
4//!
5//! For each mapped alignment, emit one BED6 record:
6//! `chrom  start  end  name  score  strand`
7//!
8//! - `start` is 0-based (BED convention), sourced directly from the BAM `pos`
9//!   field which is also 0-based.
10//! - `end` = start + reference span of the alignment.
11//! - `score` is the mapping quality (MAPQ) unless `use_edit_distance` is set,
12//!   in which case the NM auxiliary tag value is used.
13//! - `strand` is `+` for forward, `-` for reverse-complemented reads.
14//!
15//! With `split`, any spliced alignment (containing N CIGAR operations) is
16//! broken into individual exon blocks — one BED record per block. The name and
17//! score come from the read; blocks are non-overlapping and non-adjacent.
18//!
19//! Unmapped reads (FLAG & 0x4) are always skipped.
20//!
21//! ## Reference
22//!
23//! `BEDTools` bamtobed — Quinlan & Hall (2010). Bioinformatics 26(6): 841–842.
24//! DOI: 10.1093/bioinformatics/btq033
25
26use std::io::{BufWriter, Write};
27use std::num::NonZero;
28use std::path::Path;
29
30use rsomics_bamio::raw::{self, RawRecord};
31use rsomics_common::{Result, RsomicsError};
32
33// BAM CIGAR op codes (packed low-nibble encoding).
34const OP_MATCH: u8 = 0; // M
35const OP_DEL: u8 = 2; // D
36const OP_SKIP: u8 = 3; // N  (intron splice)
37const OP_EQ: u8 = 7; // =
38const OP_DIFF: u8 = 8; // X
39
40/// Returns the number of reference bases consumed by `op`.
41#[inline]
42fn ref_len(op: u8, len: u32) -> u64 {
43    match op {
44        OP_MATCH | OP_DEL | OP_SKIP | OP_EQ | OP_DIFF => u64::from(len),
45        _ => 0,
46    }
47}
48
49#[derive(Default)]
50pub struct BamToBedOpts {
51    /// Split spliced alignments (N in CIGAR) into per-exon records.
52    pub split: bool,
53    /// Use NM tag value as BED score instead of MAPQ.
54    pub use_edit_distance: bool,
55    /// Append CIGAR string as a 7th column.
56    pub cigar: bool,
57}
58
59/// A single BED record (reusable, reduces per-record allocation).
60struct Bed6<'a> {
61    chrom: &'a str,
62    start: u64,
63    end: u64,
64    name: &'a [u8],
65    score: u32,
66    strand: u8,
67    cigar: Option<&'a str>,
68}
69
70impl Bed6<'_> {
71    fn write(&self, out: &mut impl Write) -> Result<()> {
72        let mut ib = itoa::Buffer::new();
73        out.write_all(self.chrom.as_bytes())
74            .map_err(RsomicsError::Io)?;
75        out.write_all(b"\t").map_err(RsomicsError::Io)?;
76        out.write_all(ib.format(self.start).as_bytes())
77            .map_err(RsomicsError::Io)?;
78        out.write_all(b"\t").map_err(RsomicsError::Io)?;
79        out.write_all(ib.format(self.end).as_bytes())
80            .map_err(RsomicsError::Io)?;
81        out.write_all(b"\t").map_err(RsomicsError::Io)?;
82        out.write_all(self.name).map_err(RsomicsError::Io)?;
83        out.write_all(b"\t").map_err(RsomicsError::Io)?;
84        out.write_all(ib.format(self.score).as_bytes())
85            .map_err(RsomicsError::Io)?;
86        out.write_all(b"\t").map_err(RsomicsError::Io)?;
87        out.write_all(&[self.strand]).map_err(RsomicsError::Io)?;
88        if let Some(c) = self.cigar {
89            out.write_all(b"\t").map_err(RsomicsError::Io)?;
90            out.write_all(c.as_bytes()).map_err(RsomicsError::Io)?;
91        }
92        out.write_all(b"\n").map_err(RsomicsError::Io)?;
93        Ok(())
94    }
95}
96
97pub fn bam_to_bed(
98    input: &Path,
99    output: &mut dyn Write,
100    opts: &BamToBedOpts,
101    workers: NonZero<usize>,
102) -> Result<u64> {
103    let mut reader = rsomics_bamio::open_with_workers(input, workers)?;
104    let header = reader.read_header().map_err(RsomicsError::Io)?;
105    let ref_names: Vec<String> = header
106        .reference_sequences()
107        .keys()
108        .map(ToString::to_string)
109        .collect();
110
111    let mut out = BufWriter::with_capacity(256 * 1024, output);
112    let mut record = RawRecord::default();
113    let mut count: u64 = 0;
114
115    while raw::read_record(reader.get_mut(), &mut record)? != 0 {
116        let flags = record.flags();
117
118        // Skip unmapped reads.
119        if flags & 0x4 != 0 {
120            continue;
121        }
122
123        let tid = record.reference_sequence_id();
124        let pos0 = record.alignment_start(); // 0-based BAM pos
125        if tid < 0 || pos0 < 0 {
126            continue;
127        }
128
129        let chrom = usize::try_from(tid)
130            .ok()
131            .and_then(|i| ref_names.get(i))
132            .map_or("*", String::as_str);
133        let name = build_name(record.name(), flags);
134        let strand = if flags & 0x10 != 0 { b'-' } else { b'+' };
135        let score: u32 = if opts.use_edit_distance {
136            nm_tag(&record)
137        } else {
138            u32::from(record.mapping_quality())
139        };
140        let cigar_str: Option<String> = if opts.cigar {
141            Some(cigar_to_string(record.cigar_ops()))
142        } else {
143            None
144        };
145
146        if opts.split {
147            let ctx = SplitContext {
148                chrom,
149                name: &name,
150                score,
151                strand,
152                cigar_str: cigar_str.as_deref(),
153            };
154            count += emit_split_blocks(&record, &ctx, u64::try_from(pos0).unwrap_or(0), &mut out)?;
155        } else {
156            let ref_span: u64 = record.cigar_ops().map(|(op, len)| ref_len(op, len)).sum();
157            let start = u64::try_from(pos0).unwrap_or(0);
158            let end = start + ref_span;
159            Bed6 {
160                chrom,
161                start,
162                end,
163                name: &name,
164                score,
165                strand,
166                cigar: cigar_str.as_deref(),
167            }
168            .write(&mut out)?;
169            count += 1;
170        }
171    }
172
173    out.flush().map_err(RsomicsError::Io)?;
174    Ok(count)
175}
176
177struct SplitContext<'a> {
178    chrom: &'a str,
179    name: &'a [u8],
180    score: u32,
181    strand: u8,
182    cigar_str: Option<&'a str>,
183}
184
185/// Emit one BED record per exon block (CIGAR runs split at N ops).
186fn emit_split_blocks(
187    record: &RawRecord,
188    ctx: &SplitContext<'_>,
189    pos0: u64,
190    out: &mut impl Write,
191) -> Result<u64> {
192    let SplitContext {
193        chrom,
194        name,
195        score,
196        strand,
197        cigar_str,
198    } = ctx;
199    let mut ref_cursor = pos0;
200    let mut block_start = pos0;
201    let mut in_block = false;
202    let mut count: u64 = 0;
203
204    for (op, len) in record.cigar_ops() {
205        let rlen = u64::from(len);
206        match op {
207            OP_SKIP => {
208                // N: splice junction — emit the accumulated exon block.
209                if in_block {
210                    Bed6 {
211                        chrom,
212                        start: block_start,
213                        end: ref_cursor,
214                        name,
215                        score: *score,
216                        strand: *strand,
217                        cigar: None,
218                    }
219                    .write(out)?;
220                    count += 1;
221                    in_block = false;
222                }
223                ref_cursor += rlen;
224                block_start = ref_cursor;
225            }
226            OP_MATCH | OP_DEL | OP_EQ | OP_DIFF => {
227                if !in_block {
228                    block_start = ref_cursor;
229                    in_block = true;
230                }
231                ref_cursor += rlen;
232            }
233            _ => {}
234        }
235    }
236
237    // Emit the final block.
238    if in_block {
239        Bed6 {
240            chrom,
241            start: block_start,
242            end: ref_cursor,
243            name,
244            score: *score,
245            strand: *strand,
246            cigar: *cigar_str,
247        }
248        .write(out)?;
249        count += 1;
250    }
251
252    Ok(count)
253}
254
255/// Append `/1` or `/2` mate suffix to paired reads, matching bedtools bamtobed.
256///
257/// bedtools bamtobed emits the query name with `/1` for the first segment and
258/// `/2` for the second segment of a paired-end read. Single-end reads and reads
259/// already carrying a suffix (rare) get no modification.
260fn build_name(raw: &[u8], flags: u16) -> Vec<u8> {
261    let is_paired = flags & 0x1 != 0;
262    if !is_paired {
263        return raw.to_vec();
264    }
265    let suffix: &[u8] = if flags & 0x40 != 0 { b"/1" } else { b"/2" };
266    let mut name = Vec::with_capacity(raw.len() + 2);
267    name.extend_from_slice(raw);
268    name.extend_from_slice(suffix);
269    name
270}
271
272/// Read the NM (edit distance) auxiliary tag, returning 0 if absent.
273fn nm_tag(record: &RawRecord) -> u32 {
274    record
275        .aux_value(*b"NM")
276        .and_then(|val| {
277            let type_code = record.aux_type(*b"NM")?;
278            parse_int_aux(type_code, val)
279        })
280        .unwrap_or(0)
281}
282
283fn parse_int_aux(type_code: u8, val: &[u8]) -> Option<u32> {
284    match type_code {
285        // Signed byte: reinterpret bits then cast to u32 (edit distance is always >= 0).
286        b'c' => val
287            .first()
288            .map(|&v| i32::from(v.cast_signed()).cast_unsigned()),
289        b'C' => val.first().copied().map(u32::from),
290        b's' => val
291            .get(..2)
292            .map(|b| i32::from(i16::from_le_bytes([b[0], b[1]])).cast_unsigned()),
293        b'S' => val
294            .get(..2)
295            .map(|b| u32::from(u16::from_le_bytes([b[0], b[1]]))),
296        b'i' => val
297            .get(..4)
298            .map(|b| i32::from_le_bytes([b[0], b[1], b[2], b[3]]).cast_unsigned()),
299        b'I' => val
300            .get(..4)
301            .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]])),
302        _ => None,
303    }
304}
305
306const CIGAR_CHARS: &[u8] = b"MIDNSHP=X";
307
308fn cigar_to_string(ops: impl Iterator<Item = (u8, u32)>) -> String {
309    let mut s = String::new();
310    let mut ib = itoa::Buffer::new();
311    for (op, len) in ops {
312        s.push_str(ib.format(len));
313        s.push(CIGAR_CHARS.get(op as usize).copied().unwrap_or(b'?') as char);
314    }
315    s
316}