1use std::cmp::Reverse;
23use std::collections::BinaryHeap;
24use std::fs::File;
25use std::io::{BufRead, BufReader, BufWriter, Write};
26use std::path::Path;
27
28use rsomics_common::{Result, RsomicsError};
29
30#[derive(Clone, Copy, PartialEq, Eq, Debug)]
31pub enum Algorithm {
32 Size,
33 Simple,
34}
35
36pub fn split(
41 input: &Path,
42 n: usize,
43 prefix: &str,
44 algorithm: Algorithm,
45) -> Result<Vec<(String, u64, u64)>> {
46 if n == 0 {
47 return Err(RsomicsError::InvalidInput(
48 "number of output files must be >= 1".into(),
49 ));
50 }
51
52 let records = read_records(input)?;
53
54 let write_order = match algorithm {
56 Algorithm::Size => assign_size(&records, n),
57 Algorithm::Simple => assign_simple(records.len(), n),
58 };
59
60 write_files(&records, &write_order, n, prefix)
61}
62
63struct Record {
66 line: String,
67 len: u64,
68}
69
70fn read_records(path: &Path) -> Result<Vec<Record>> {
71 let file = File::open(path)
72 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
73 let reader = BufReader::new(file);
74 let mut records = Vec::new();
75
76 for raw in reader.lines() {
77 let line = raw.map_err(RsomicsError::Io)?;
78 let trimmed = line.trim_end_matches('\r');
79 if trimmed.is_empty()
80 || trimmed.starts_with('#')
81 || trimmed.starts_with("track")
82 || trimmed.starts_with("browser")
83 {
84 continue;
85 }
86 let len = parse_len(trimmed)?;
87 records.push(Record {
88 line: trimmed.to_string(),
89 len,
90 });
91 }
92
93 Ok(records)
94}
95
96fn parse_len(line: &str) -> Result<u64> {
97 let mut cols = line.splitn(4, '\t');
98 let _chrom = cols
99 .next()
100 .ok_or_else(|| RsomicsError::InvalidInput(format!("bad BED line: {line}")))?;
101 let start_s = cols
102 .next()
103 .ok_or_else(|| RsomicsError::InvalidInput(format!("bad BED line (no start): {line}")))?;
104 let end_s = cols
105 .next()
106 .ok_or_else(|| RsomicsError::InvalidInput(format!("bad BED line (no end): {line}")))?;
107 let start: u64 = start_s
108 .parse()
109 .map_err(|e| RsomicsError::InvalidInput(format!("start parse '{start_s}': {e}")))?;
110 let end: u64 = end_s
111 .parse()
112 .map_err(|e| RsomicsError::InvalidInput(format!("end parse '{end_s}': {e}")))?;
113 Ok(end.saturating_sub(start))
114}
115
116fn assign_size(records: &[Record], n: usize) -> Vec<Vec<usize>> {
124 let mut order: Vec<usize> = (0..records.len()).collect();
125 order.sort_unstable_by(|&a, &b| records[b].len.cmp(&records[a].len));
126
127 let mut heap: BinaryHeap<Reverse<(u64, usize)>> = (0..n).map(|i| Reverse((0, i))).collect();
129
130 let mut bin_records: Vec<Vec<usize>> = vec![Vec::new(); n];
131
132 for rec_idx in order {
133 let Reverse((bases, bin)) = heap.pop().unwrap();
134 bin_records[bin].push(rec_idx);
135 heap.push(Reverse((bases + records[rec_idx].len, bin)));
136 }
137
138 bin_records
139}
140
141fn assign_simple(record_count: usize, n: usize) -> Vec<Vec<usize>> {
145 let mut bins: Vec<Vec<usize>> = vec![Vec::new(); n];
146 for i in 0..record_count {
147 bins[i % n].push(i);
148 }
149 bins
150}
151
152fn write_files(
155 records: &[Record],
156 write_order: &[Vec<usize>],
157 n: usize,
158 prefix: &str,
159) -> Result<Vec<(String, u64, u64)>> {
160 let filenames: Vec<String> = (1..=n).map(|i| format!("{prefix}.{i:05}.bed")).collect();
161
162 let mut summary: Vec<(String, u64, u64)> = Vec::with_capacity(n);
163
164 for (bin, name) in filenames.iter().enumerate() {
165 let f = File::create(name)
166 .map_err(|e| RsomicsError::InvalidInput(format!("Cannot open \"{name}\". {e}")))?;
167 let mut w = BufWriter::new(f);
168 let mut bases: u64 = 0;
169 let mut count: u64 = 0;
170 for &idx in &write_order[bin] {
171 writeln!(w, "{}", records[idx].line).map_err(RsomicsError::Io)?;
172 bases += records[idx].len;
173 count += 1;
174 }
175 w.flush().map_err(RsomicsError::Io)?;
176 summary.push((name.clone(), bases, count));
177 }
178
179 let stderr = std::io::stderr();
180 let mut err = stderr.lock();
181 for (name, b, c) in &summary {
182 writeln!(err, "{name}\t{b}\t{c}").map_err(RsomicsError::Io)?;
183 }
184
185 Ok(summary)
186}