Skip to main content

rsomics_bed_shift/
lib.rs

1//! Shift BED coordinates by a fixed offset.
2//!
3//! Each record's start and end are shifted by `shift` (positive = right,
4//! negative = left). When a genome-sizes file is provided, coordinates are
5//! clamped to `[0, chrom_size)`. Records that shift entirely off a chromosome
6//! end are dropped (matching bedtools shift behaviour).
7//!
8//! Strand-aware mode (`-p`/`-m`): shift by `plus_shift` when strand column is
9//! `+`, by `minus_shift` when `-`, else by the general `shift`.
10//!
11//! BED header/track/browser lines pass through unchanged.
12
13use std::collections::HashMap;
14use std::fs::File;
15use std::io::{self, BufRead, BufReader, BufWriter, Write};
16use std::path::Path;
17
18use rsomics_common::{Result, RsomicsError};
19
20/// Load genome sizes file into a `chrom → size` map.
21/// Each non-comment line is `chrom<TAB>size`.
22pub fn load_genome(path: &Path) -> Result<HashMap<String, u64>> {
23    let raw = std::fs::read_to_string(path)
24        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
25    let mut map = HashMap::new();
26    for line in raw.lines() {
27        if line.is_empty() || line.starts_with('#') {
28            continue;
29        }
30        let mut parts = line.splitn(2, '\t');
31        let chrom = parts
32            .next()
33            .ok_or_else(|| RsomicsError::InvalidInput(format!("genome: bad line: {line:?}")))?;
34        let size_str = parts
35            .next()
36            .ok_or_else(|| RsomicsError::InvalidInput(format!("genome: missing size for {chrom}")))?
37            .trim();
38        let size: u64 = size_str.parse().map_err(|_| {
39            RsomicsError::InvalidInput(format!("genome: bad size {size_str:?} for {chrom}"))
40        })?;
41        map.insert(chrom.to_owned(), size);
42    }
43    Ok(map)
44}
45
46#[inline]
47fn is_header(line: &str) -> bool {
48    line.is_empty()
49        || line.starts_with('#')
50        || line.starts_with("track")
51        || line.starts_with("browser")
52}
53
54/// Shift BED records from `reader` by `shift` bases, write to `output`.
55///
56/// `plus_shift` / `minus_shift` — per-strand overrides; set equal to `shift`
57///   to disable strand-aware mode.
58/// `genome` — when `Some`, coordinates are clamped and off-end records dropped.
59pub fn shift_reader<R: io::Read>(
60    reader: BufReader<R>,
61    shift: i64,
62    plus_shift: i64,
63    minus_shift: i64,
64    genome: Option<&HashMap<String, u64>>,
65    output: &mut dyn Write,
66) -> Result<()> {
67    let mut out = BufWriter::with_capacity(256 * 1024, output);
68    for (lineno_0, line) in reader.lines().enumerate() {
69        let line = line.map_err(RsomicsError::Io)?;
70        if is_header(&line) {
71            out.write_all(line.as_bytes()).map_err(RsomicsError::Io)?;
72            out.write_all(b"\n").map_err(RsomicsError::Io)?;
73            continue;
74        }
75        let lineno = lineno_0 + 1;
76
77        // Split into at most 6 columns; columns beyond 5 stay joined in col[5].
78        let cols: Vec<&str> = line.splitn(7, '\t').collect();
79        if cols.len() < 3 {
80            return Err(RsomicsError::InvalidInput(format!(
81                "line {lineno}: fewer than 3 fields"
82            )));
83        }
84        let chrom = cols[0];
85        let start: i64 = cols[1].parse().map_err(|_| {
86            RsomicsError::InvalidInput(format!("line {lineno}: bad start {:?}", cols[1]))
87        })?;
88        let end: i64 = cols[2].parse().map_err(|_| {
89            RsomicsError::InvalidInput(format!("line {lineno}: bad end {:?}", cols[2]))
90        })?;
91
92        // Column 5 (0-indexed 4) is the strand column in BED6+.
93        let strand = if cols.len() >= 6 { cols[5] } else { "." };
94        let s = if strand == "+" {
95            plus_shift
96        } else if strand == "-" {
97            minus_shift
98        } else {
99            shift
100        };
101
102        let new_start = start + s;
103        let new_end = end + s;
104
105        let (new_start, new_end) = if let Some(g) = genome {
106            let chrom_sz = g.get(chrom).copied().unwrap_or(i64::MAX as u64) as i64;
107            let cs = new_start.max(0);
108            let ce = new_end.min(chrom_sz);
109            if cs >= ce {
110                continue;
111            }
112            (cs, ce)
113        } else {
114            (new_start, new_end)
115        };
116
117        // Reconstruct output: first 3 cols updated, rest pass through verbatim.
118        out.write_all(chrom.as_bytes()).map_err(RsomicsError::Io)?;
119        write!(out, "\t{new_start}\t{new_end}").map_err(RsomicsError::Io)?;
120        for col in &cols[3..] {
121            out.write_all(b"\t").map_err(RsomicsError::Io)?;
122            out.write_all(col.as_bytes()).map_err(RsomicsError::Io)?;
123        }
124        out.write_all(b"\n").map_err(RsomicsError::Io)?;
125    }
126    out.flush().map_err(RsomicsError::Io)?;
127    Ok(())
128}
129
130/// Shift BED file at `path`.
131pub fn shift(
132    path: &Path,
133    shift: i64,
134    plus_shift: i64,
135    minus_shift: i64,
136    genome: Option<&HashMap<String, u64>>,
137    output: &mut dyn Write,
138) -> Result<()> {
139    let file = File::open(path)
140        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
141    shift_reader(
142        BufReader::new(file),
143        shift,
144        plus_shift,
145        minus_shift,
146        genome,
147        output,
148    )
149}
150
151/// Same as [`shift`] but reads from stdin.
152pub fn shift_stdin(
153    shift_val: i64,
154    plus_shift: i64,
155    minus_shift: i64,
156    genome: Option<&HashMap<String, u64>>,
157    output: &mut dyn Write,
158) -> Result<()> {
159    shift_reader(
160        BufReader::new(io::stdin()),
161        shift_val,
162        plus_shift,
163        minus_shift,
164        genome,
165        output,
166    )
167}