Skip to main content

rsomics_bed_split/
lib.rs

1//! Split a BED file into N approximately equal parts.
2//!
3//! Two algorithms (matching bedtools split v2.31.1):
4//!
5//! **size** (default) — greedy bin-packing: sort records by interval length
6//! descending, assign each to the bin currently holding the fewest bases.
7//! Within each output file, records appear in descending length order (the
8//! processing order).  Produces output files whose total base-pair content is
9//! balanced.
10//!
11//! **simple** — round-robin: records are routed to files 1, 2, …, N, 1, 2, …
12//! in input order.  Records within each file preserve input order.  Produces
13//! files with approximately equal record counts.
14//!
15//! Output files are written to `<prefix>.NNNNN.bed` (zero-padded to 5 digits).
16//! A summary line `<filename>\t<bases>\t<records>` is written to stderr for
17//! each output file, matching bedtools' exact format.
18//!
19//! Complexity: O(N) for `simple`; O(R log F) for `size` where R is record
20//! count and F is the number of output files (min-heap over F bins).
21
22use std::cmp::Reverse;
23use std::collections::BinaryHeap;
24use std::fs::File;
25use std::io::{BufRead, BufReader, BufWriter, Write};
26use std::path::Path;
27
28use rsomics_common::{Result, RsomicsError};
29
30#[derive(Clone, Copy, PartialEq, Eq, Debug)]
31pub enum Algorithm {
32    Size,
33    Simple,
34}
35
36/// Split `input` BED into `n` files with the given `prefix`.
37///
38/// Returns `Vec<(filename, total_bases, record_count)>` in file-index order,
39/// matching the stderr summary bedtools emits.
40pub fn split(
41    input: &Path,
42    n: usize,
43    prefix: &str,
44    algorithm: Algorithm,
45) -> Result<Vec<(String, u64, u64)>> {
46    if n == 0 {
47        return Err(RsomicsError::InvalidInput(
48            "number of output files must be >= 1".into(),
49        ));
50    }
51
52    let records = read_records(input)?;
53
54    // `write_order[bin]` = indices into `records` in the order they should be written.
55    let write_order = match algorithm {
56        Algorithm::Size => assign_size(&records, n),
57        Algorithm::Simple => assign_simple(records.len(), n),
58    };
59
60    write_files(&records, &write_order, n, prefix)
61}
62
63// ── record I/O ────────────────────────────────────────────────────────────────
64
65struct Record {
66    line: String,
67    len: u64,
68}
69
70fn read_records(path: &Path) -> Result<Vec<Record>> {
71    let file = File::open(path)
72        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
73    let reader = BufReader::new(file);
74    let mut records = Vec::new();
75
76    for raw in reader.lines() {
77        let line = raw.map_err(RsomicsError::Io)?;
78        let trimmed = line.trim_end_matches('\r');
79        if trimmed.is_empty()
80            || trimmed.starts_with('#')
81            || trimmed.starts_with("track")
82            || trimmed.starts_with("browser")
83        {
84            continue;
85        }
86        let len = parse_len(trimmed)?;
87        records.push(Record {
88            line: trimmed.to_string(),
89            len,
90        });
91    }
92
93    Ok(records)
94}
95
96fn parse_len(line: &str) -> Result<u64> {
97    let mut cols = line.splitn(4, '\t');
98    let _chrom = cols
99        .next()
100        .ok_or_else(|| RsomicsError::InvalidInput(format!("bad BED line: {line}")))?;
101    let start_s = cols
102        .next()
103        .ok_or_else(|| RsomicsError::InvalidInput(format!("bad BED line (no start): {line}")))?;
104    let end_s = cols
105        .next()
106        .ok_or_else(|| RsomicsError::InvalidInput(format!("bad BED line (no end): {line}")))?;
107    let start: u64 = start_s
108        .parse()
109        .map_err(|e| RsomicsError::InvalidInput(format!("start parse '{start_s}': {e}")))?;
110    let end: u64 = end_s
111        .parse()
112        .map_err(|e| RsomicsError::InvalidInput(format!("end parse '{end_s}': {e}")))?;
113    Ok(end.saturating_sub(start))
114}
115
116// ── assignment algorithms ─────────────────────────────────────────────────────
117
118/// Greedy bin-packing (size algorithm).
119///
120/// Returns `write_order[bin]` = Vec of record indices in the order they should
121/// appear in that bin's output file (descending length order, matching
122/// bedtools' processing order).
123fn assign_size(records: &[Record], n: usize) -> Vec<Vec<usize>> {
124    let mut order: Vec<usize> = (0..records.len()).collect();
125    order.sort_unstable_by(|&a, &b| records[b].len.cmp(&records[a].len));
126
127    // Min-heap: (current_total_bases, bin_index).
128    let mut heap: BinaryHeap<Reverse<(u64, usize)>> = (0..n).map(|i| Reverse((0, i))).collect();
129
130    let mut bin_records: Vec<Vec<usize>> = vec![Vec::new(); n];
131
132    for rec_idx in order {
133        let Reverse((bases, bin)) = heap.pop().unwrap();
134        bin_records[bin].push(rec_idx);
135        heap.push(Reverse((bases + records[rec_idx].len, bin)));
136    }
137
138    bin_records
139}
140
141/// Round-robin assignment (simple algorithm).
142///
143/// Returns `write_order[bin]` = Vec of record indices in input order.
144fn assign_simple(record_count: usize, n: usize) -> Vec<Vec<usize>> {
145    let mut bins: Vec<Vec<usize>> = vec![Vec::new(); n];
146    for i in 0..record_count {
147        bins[i % n].push(i);
148    }
149    bins
150}
151
152// ── output ────────────────────────────────────────────────────────────────────
153
154fn write_files(
155    records: &[Record],
156    write_order: &[Vec<usize>],
157    n: usize,
158    prefix: &str,
159) -> Result<Vec<(String, u64, u64)>> {
160    let filenames: Vec<String> = (1..=n).map(|i| format!("{prefix}.{i:05}.bed")).collect();
161
162    let mut summary: Vec<(String, u64, u64)> = Vec::with_capacity(n);
163
164    for (bin, name) in filenames.iter().enumerate() {
165        let f = File::create(name)
166            .map_err(|e| RsomicsError::InvalidInput(format!("Cannot open \"{name}\". {e}")))?;
167        let mut w = BufWriter::new(f);
168        let mut bases: u64 = 0;
169        let mut count: u64 = 0;
170        for &idx in &write_order[bin] {
171            writeln!(w, "{}", records[idx].line).map_err(RsomicsError::Io)?;
172            bases += records[idx].len;
173            count += 1;
174        }
175        w.flush().map_err(RsomicsError::Io)?;
176        summary.push((name.clone(), bases, count));
177    }
178
179    let stderr = std::io::stderr();
180    let mut err = stderr.lock();
181    for (name, b, c) in &summary {
182        writeln!(err, "{name}\t{b}\t{c}").map_err(RsomicsError::Io)?;
183    }
184
185    Ok(summary)
186}