genomicframe_core/formats/bed/
reader.rs1use crate::core::GenomicRecordIterator;
54use crate::error::{Error, Result};
55use crate::io::Compression;
56use flate2::read::MultiGzDecoder;
57use std::fs::File;
58use std::io::{BufRead, BufReader};
59use std::path::Path;
60
61#[derive(Debug, Clone, PartialEq)]
65pub struct BedRecord {
66 pub chrom: String,
68
69 pub start: u64,
71
72 pub end: u64,
74
75 pub name: Option<String>,
77
78 pub score: Option<f64>,
80
81 pub strand: Option<char>,
83
84 pub thick_start: Option<u64>,
86
87 pub thick_end: Option<u64>,
89
90 pub item_rgb: Option<String>,
92
93 pub block_count: Option<u32>,
95
96 pub block_sizes: Option<Vec<u32>>,
98
99 pub block_starts: Option<Vec<u32>>,
101}
102
103impl BedRecord {
104 pub fn len(&self) -> u64 {
106 self.end.saturating_sub(self.start)
107 }
108
109 pub fn is_empty(&self) -> bool {
111 self.start >= self.end
112 }
113
114 pub fn strand_str(&self) -> &str {
116 match self.strand {
117 Some('+') => "+",
118 Some('-') => "-",
119 _ => ".",
120 }
121 }
122
123 pub fn is_bed6(&self) -> bool {
125 self.name.is_some() && self.score.is_some() && self.strand.is_some()
126 }
127
128 pub fn is_bed12(&self) -> bool {
130 self.thick_start.is_some()
131 && self.thick_end.is_some()
132 && self.block_count.is_some()
133 }
134}
135
136pub struct BedReader {
138 reader: Box<dyn BufRead>,
139 line_buffer: String,
140 line_number: usize,
141}
142
143impl BedReader {
144 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
146 let path = path.as_ref();
147 let file = File::open(path)?;
148 let compression = Compression::from_path(path);
149
150 let reader: Box<dyn BufRead> = match compression {
151 Compression::Gzip => Box::new(BufReader::new(MultiGzDecoder::new(file))),
152 _ => Box::new(BufReader::new(file)),
153 };
154
155 Ok(Self {
156 reader,
157 line_buffer: String::with_capacity(256),
158 line_number: 0,
159 })
160 }
161
162 pub fn new<R: BufRead + 'static>(reader: R) -> Self {
164 Self {
165 reader: Box::new(reader),
166 line_buffer: String::with_capacity(256),
167 line_number: 0,
168 }
169 }
170
171 fn parse_record(&mut self, line: &str) -> Result<BedRecord> {
173 let fields: Vec<&str> = line.split('\t').collect();
174
175 if fields.len() < 3 {
176 return Err(Error::Parse(format!(
177 "Line {}: BED record must have at least 3 fields (chrom, start, end), got {}",
178 self.line_number,
179 fields.len()
180 )));
181 }
182
183 let chrom = fields[0].to_string();
185 let start = fields[1].parse::<u64>().map_err(|_| {
186 Error::Parse(format!(
187 "Line {}: Invalid start position: {}",
188 self.line_number, fields[1]
189 ))
190 })?;
191 let end = fields[2].parse::<u64>().map_err(|_| {
192 Error::Parse(format!(
193 "Line {}: Invalid end position: {}",
194 self.line_number, fields[2]
195 ))
196 })?;
197
198 if start > end {
200 return Err(Error::Parse(format!(
201 "Line {}: Invalid interval: start ({}) > end ({})",
202 self.line_number, start, end
203 )));
204 }
205
206 let name = fields.get(3).map(|s| s.to_string());
208 let score = fields.get(4).and_then(|s| s.parse::<f64>().ok());
209 let strand = fields.get(5).and_then(|s| {
210 let c = s.chars().next()?;
211 if c == '+' || c == '-' || c == '.' {
212 Some(c)
213 } else {
214 None
215 }
216 });
217
218 let thick_start = fields.get(6).and_then(|s| s.parse::<u64>().ok());
220 let thick_end = fields.get(7).and_then(|s| s.parse::<u64>().ok());
221 let item_rgb = fields.get(8).map(|s| s.to_string());
222 let block_count = fields.get(9).and_then(|s| s.parse::<u32>().ok());
223
224 let block_sizes = fields.get(10).map(|s| {
225 s.trim_end_matches(',')
226 .split(',')
227 .filter_map(|x| x.parse::<u32>().ok())
228 .collect()
229 });
230
231 let block_starts = fields.get(11).map(|s| {
232 s.trim_end_matches(',')
233 .split(',')
234 .filter_map(|x| x.parse::<u32>().ok())
235 .collect()
236 });
237
238 Ok(BedRecord {
239 chrom,
240 start,
241 end,
242 name,
243 score,
244 strand,
245 thick_start,
246 thick_end,
247 item_rgb,
248 block_count,
249 block_sizes,
250 block_starts,
251 })
252 }
253}
254
255impl GenomicRecordIterator for BedReader {
256 type Record = BedRecord;
257
258 fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
259 loop {
260 self.line_buffer.clear();
261 let bytes = self.reader.read_line(&mut self.line_buffer)?;
262
263 if bytes == 0 {
264 return Ok(None);
265 }
266
267 self.line_number += 1;
268 let line = self.line_buffer.trim();
269
270 if line.is_empty() || line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
272 continue;
273 }
274
275 return Ok(Some(line.as_bytes().to_vec()));
276 }
277 }
278
279 fn next_record(&mut self) -> Result<Option<Self::Record>> {
280 loop {
281 self.line_buffer.clear();
282 let bytes = self.reader.read_line(&mut self.line_buffer)?;
283
284 if bytes == 0 {
285 return Ok(None);
286 }
287
288 self.line_number += 1;
289 let line = self.line_buffer.trim().to_string();
290
291 if line.is_empty() || line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
293 continue;
294 }
295
296 return Ok(Some(self.parse_record(&line)?));
297 }
298 }
299}
300
301#[cfg(test)]
302mod tests {
303 use super::*;
304
305 #[test]
306 fn test_bed3_parsing() {
307 let data = "chr1\t1000\t2000\n";
308 let mut reader = BedReader::new(data.as_bytes());
309
310 let record = reader.next_record().unwrap().unwrap();
311 assert_eq!(record.chrom, "chr1");
312 assert_eq!(record.start, 1000);
313 assert_eq!(record.end, 2000);
314 assert_eq!(record.len(), 1000);
315 assert!(record.name.is_none());
316 }
317
318 #[test]
319 fn test_bed6_parsing() {
320 let data = "chr1\t1000\t2000\tgene1\t100\t+\n";
321 let mut reader = BedReader::new(data.as_bytes());
322
323 let record = reader.next_record().unwrap().unwrap();
324 assert_eq!(record.chrom, "chr1");
325 assert_eq!(record.start, 1000);
326 assert_eq!(record.end, 2000);
327 assert_eq!(record.name, Some("gene1".to_string()));
328 assert_eq!(record.score, Some(100.0));
329 assert_eq!(record.strand, Some('+'));
330 assert!(record.is_bed6());
331 }
332
333 #[test]
334 fn test_bed12_parsing() {
335 let data = "chr1\t1000\t5000\tgene1\t100\t+\t1200\t4800\t255,0,0\t2\t1000,800\t0,3200\n";
336 let mut reader = BedReader::new(data.as_bytes());
337
338 let record = reader.next_record().unwrap().unwrap();
339 assert_eq!(record.chrom, "chr1");
340 assert_eq!(record.thick_start, Some(1200));
341 assert_eq!(record.thick_end, Some(4800));
342 assert_eq!(record.block_count, Some(2));
343 assert_eq!(record.block_sizes, Some(vec![1000, 800]));
344 assert_eq!(record.block_starts, Some(vec![0, 3200]));
345 assert!(record.is_bed12());
346 }
347
348 #[test]
349 fn test_skip_comments() {
350 let data = "# This is a comment\ntrack name=test\nchr1\t1000\t2000\n";
351 let mut reader = BedReader::new(data.as_bytes());
352
353 let record = reader.next_record().unwrap().unwrap();
354 assert_eq!(record.chrom, "chr1");
355 }
356
357 #[test]
358 fn test_invalid_interval() {
359 let data = "chr1\t2000\t1000\n"; let mut reader = BedReader::new(data.as_bytes());
361
362 assert!(reader.next_record().is_err());
363 }
364
365 #[test]
366 fn test_strand_str() {
367 let record = BedRecord {
368 chrom: "chr1".to_string(),
369 start: 1000,
370 end: 2000,
371 name: None,
372 score: None,
373 strand: Some('+'),
374 thick_start: None,
375 thick_end: None,
376 item_rgb: None,
377 block_count: None,
378 block_sizes: None,
379 block_starts: None,
380 };
381
382 assert_eq!(record.strand_str(), "+");
383 }
384}