bigbed/
lib.rs

1extern crate flate2;
2
3pub mod error;
4use crate::error::Error::{self, *};
5
6use std::io::{Read, Seek, SeekFrom, Write};
7use std::collections::VecDeque;
8use std::convert::TryInto;
9use flate2::{Decompress, FlushDecompress};
10
11
12static BIGBED_SIG: [u8; 4] = [0x87, 0x89, 0xF2, 0xEB];
13static BPT_SIG: [u8; 4] = [0x78, 0xCA, 0x8C, 0x91];
14static CIRTREE_SIG: [u8; 4] = [0x24, 0x68, 0xAC, 0xE0];
15
16
17/// a collection of useful methods for producing bytes from a type that implements Read
18pub trait ByteReader: Read {
19    fn read_u64(&mut self, big_endian: bool) -> u64 {
20        let mut bytes: [u8; 8] = [0;8];
21        self.read_exact(&mut bytes).unwrap();
22
23        if big_endian {
24            u64::from_be_bytes(bytes)
25        } else {
26            u64::from_le_bytes(bytes)
27        }
28    }
29
30    fn read_u32(&mut self, big_endian: bool) -> u32 {
31        let mut bytes: [u8; 4] = [0;4];
32        self.read_exact(&mut bytes).unwrap();
33
34        if big_endian {
35            u32::from_be_bytes(bytes)
36        } else {
37            u32::from_le_bytes(bytes)
38        }
39    }
40
41    fn read_u16(&mut self, big_endian: bool) -> u16 {
42        let mut bytes: [u8; 2] = [0;2];
43        self.read_exact(&mut bytes).unwrap();
44        if big_endian {
45            u16::from_be_bytes(bytes)
46        } else {
47            u16::from_le_bytes(bytes)
48        }
49    }
50
51    fn read_u8(&mut self) -> u8 {
52        let mut bytes: [u8; 1] = [0;1];
53        self.read_exact(&mut bytes).unwrap();
54        bytes[0]
55    }
56}
57
58impl<T: Read> ByteReader for T {}
59
60#[derive(Debug, PartialEq)]
61pub struct ZoomLevel {
62    reduction_level: u32,
63    reserved: u32,
64    data_offset: u64,
65    index_offset: u64,
66}
67
68#[derive(Debug, PartialEq)]
69pub struct FileOffsetSize{
70    offset: usize,
71    size: usize,
72}
73
74pub fn find_file_offset_gap(block_list: &[FileOffsetSize]) -> (&[FileOffsetSize], &[FileOffsetSize]) {
75    for (index, block) in block_list.iter().enumerate() {
76        let next = index + 1;
77        // find the first gap
78        if next < block_list.len()  && block_list[next].offset != block.offset + block.size {
79            return (&block_list[..next], &block_list[next..])
80        }
81    }
82    (&block_list[..], &[])
83}
84
85fn strip_null(inp: &str) -> &str {
86    let mut start = 0;
87    for (index, byte) in inp.bytes().enumerate() {
88        if start == index && byte == 0 {
89            start += 1
90        } else {
91            break
92        }
93    }
94    for (index, byte) in inp.bytes().enumerate().skip(start) {
95        if byte == 0 {
96            return &inp[start..index]
97        }
98    }
99    &inp[start..]
100}
101
102#[derive(Debug, PartialEq)]
103pub struct Chrom{
104    name: String,
105    id: u32,
106    size: u32,
107}
108
109#[derive(Debug, PartialEq)]
110pub struct BedLine {
111    chrom_id: u32,
112    start: u32,
113    end: u32,
114    rest: Option<String>,
115}
116
117#[derive(Debug)]
118struct BPlusTreeFile { 
119    big_endian: bool,
120    block_size: u32,
121    key_size: usize,
122    val_size: usize,
123    item_count: u64,
124    root_offset: u64,
125}
126
127impl BPlusTreeFile {
128    fn with_reader<T: Read + Seek>(reader: &mut T) -> Result<BPlusTreeFile, Error> {
129        // check the signature first
130        let mut buff = [0; 4];
131        reader.read_exact(&mut buff)?;
132        let big_endian =
133            if buff == BPT_SIG {
134                true
135            } else if buff.iter().eq(BPT_SIG.iter().rev()) {
136                false
137            } else {
138                return Err(Error::BadSig{expected: BPT_SIG, received: buff});
139            };
140
141        //read all the header information
142        let block_size = reader.read_u32(big_endian);
143        let key_size = reader.read_u32(big_endian).try_into()?;
144        let val_size = reader.read_u32(big_endian).try_into()?;
145        let item_count = reader.read_u64(big_endian);
146
147        // skip over the reserved region and get the root offset
148        let root_offset = reader.seek(SeekFrom::Current(8))?;
149        Ok(BPlusTreeFile{big_endian, block_size, key_size, val_size, item_count, root_offset})
150    }
151
152    //TODO: eventually abstract the traversal function as an iterator
153    fn chrom_list<T: Read + Seek>(&self, reader: &mut T) -> Result<Vec<Chrom>, Error> {
154        // move reader to the root_offset
155        let mut chroms: Vec<Chrom> = Vec::new();
156        let mut offsets = VecDeque::new();
157        offsets.push_back(self.root_offset);
158        while let Some(offset) = offsets.pop_front() {
159            // move to the offset
160            reader.seek(SeekFrom::Start(offset))?;
161            
162            // read block header
163            let is_leaf = reader.read_u8();
164            let _reserved = reader.read_u8();
165            let child_count = reader.read_u16(self.big_endian);
166            if is_leaf != 0 {
167                let mut valbuf: Vec<u8> = vec![0; self.val_size.try_into().unwrap()];
168                for _  in 0..child_count {
169                    let mut keybuf: Vec<u8> = vec![0; self.key_size.try_into().unwrap()];
170                    //TODO: move this into the declaration of the file
171                    if self.val_size != 8 {
172                        panic!("Expected chromosome data to be 8 bytes not, {}", self.val_size)
173                    }
174                    reader.read_exact(&mut keybuf)?;
175                    reader.read_exact(&mut valbuf)?;
176                    
177                    let id = if self.big_endian {
178                        u32::from_be_bytes(valbuf[0..4].try_into().unwrap())
179                    } else {
180                        u32::from_le_bytes(valbuf[0..4].try_into().unwrap())
181                    };
182                    let size = if self.big_endian {
183                        u32::from_be_bytes(valbuf[4..8].try_into().unwrap())
184                    } else {
185                        u32::from_le_bytes(valbuf[4..8].try_into().unwrap())
186                    };
187                    let chrom = Chrom{
188                        name: String::from_utf8(keybuf).unwrap(), id, size
189                    };
190                    chroms.push(chrom);
191                }
192            } else {
193                for _ in 0..child_count {
194                    // skip over the key in each block
195                    // note that keysize is typically a few bytes, so converting into 
196                    // the i32 format should not cause a panic
197                    reader.seek(SeekFrom::Current(self.key_size.try_into()?))?;
198                    // read an offset and add it to the list to traverse
199                    let offset = reader.read_u64(self.big_endian);
200                    offsets.push_back(offset);
201                }
202            }
203        }
204        Ok(chroms)
205    }
206
207    // TODO: abstract this method
208    fn find<T: Read + Seek>(&self, chrom: &str, reader: &mut T) -> Result<Option<Chrom>, Error> {
209        if chrom.len() > self.key_size {
210            return Err(Error::BadKey(chrom.to_owned(), self.key_size))
211        }
212        // if key is too short, we need to pad it with null character
213        if chrom.len() != (self.key_size) {
214            // prepare a new key
215            let mut padded_key = String::with_capacity(self.key_size);
216            padded_key.push_str(chrom);
217
218            let needed: usize = self.key_size - chrom.len();
219            for _ in 0..needed {
220                padded_key.push('\0');
221            }
222            self._find_internal(&padded_key, reader)
223        } else {
224            self._find_internal(chrom, reader)
225        }
226    }
227
228    fn _find_internal<T: Read + Seek>(&self, chrom: &str, reader: &mut T) -> Result<Option<Chrom>, Error> {
229        let mut offsets = VecDeque::new();
230        offsets.push_back(self.root_offset);
231        while let Some(offset) = offsets.pop_front() {
232            // move to the offset
233            reader.seek(SeekFrom::Start(offset))?;
234
235            // read block header
236            let is_leaf = reader.read_u8();
237            let _reserved = reader.read_u8();
238            let child_count = reader.read_u16(self.big_endian);
239            if is_leaf != 0 {
240                let mut valbuf: Vec<u8> = vec![0; self.val_size.try_into().unwrap()];
241                for _  in 0..child_count {
242                    let mut keybuf: Vec<u8> = vec![0; self.key_size.try_into().unwrap()];
243                    reader.read(&mut keybuf)?;
244                    reader.read(&mut valbuf)?;
245                    let other_key = String::from_utf8(keybuf).unwrap();
246                    if other_key == chrom {
247                        if self.val_size != 8 {
248                            panic!("Expected chromosome data to be 8 bytes not, {}", self.val_size)
249                        }
250                        let id = if self.big_endian {
251                            u32::from_be_bytes(valbuf[0..4].try_into().unwrap())
252                        } else {
253                            u32::from_le_bytes(valbuf[0..4].try_into().unwrap())
254                        };
255                        let size = if self.big_endian {
256                            u32::from_be_bytes(valbuf[4..8].try_into().unwrap())
257                        } else {
258                            u32::from_le_bytes(valbuf[4..8].try_into().unwrap())
259                        };
260                        // return the proper data
261                        return Ok(Some(Chrom{name: other_key, id, size}))
262                    }
263                }
264            } else {
265                // skip past the first key
266                reader.seek(SeekFrom::Current(self.key_size.try_into()?))?;
267                // read the offset
268                let mut prev_offset = reader.read_u64(self.big_endian);
269                for _ in 1..child_count {
270                    let mut keybuf: Vec<u8> = vec![0; self.key_size];
271                    reader.read(&mut keybuf)?;
272                    let other_key = String::from_utf8(keybuf).unwrap();
273                    // if find a bigger key, that means we passed our good key
274                    if chrom < &other_key {
275                        break;
276                    }
277                    // otherwise: read the next offset and keep going
278                    prev_offset = reader.read_u64(self.big_endian);
279                }
280                offsets.push_back(prev_offset);
281            }
282        }
283        Ok(None)
284    }
285}
286
287#[derive(Debug)]
288struct CIRTreeFile {
289    big_endian: bool,
290    block_size: u32,
291    item_count: u64,
292    start_chrom_ix: u32,
293    start_base: u32,
294    end_chrom_ix: u32,
295    end_base: u32,
296    file_size: u64,
297    items_per_slot: u32,
298    root_offset: u64,
299}
300
301fn cir_overlaps(q_chrom: u32, q_start: u32, q_end: u32, 
302                start_chrom: u32, start_base: u32, 
303                end_chrom: u32, end_base: u32) -> bool {
304    (q_chrom, q_start) < (end_chrom, end_base) 
305    && (q_chrom, q_end) > (start_chrom, start_base)
306}
307
308impl CIRTreeFile {
309    fn with_reader<T: Read + Seek>(reader: &mut T) -> Result<CIRTreeFile, Error> {
310        // check the signature first
311        let mut buff = [0; 4];
312        reader.read_exact(&mut buff)?;
313        let big_endian =
314            if buff == CIRTREE_SIG {
315                true
316            } else if buff.iter().eq(CIRTREE_SIG.iter().rev()) {
317                false
318            } else {
319                return Err(Error::BadSig{expected: CIRTREE_SIG, received: buff});
320            };
321
322        //read all the header information
323        let block_size = reader.read_u32(big_endian);
324        let item_count = reader.read_u64(big_endian);
325        let start_chrom_ix = reader.read_u32(big_endian);
326        let start_base = reader.read_u32(big_endian);
327        let end_chrom_ix = reader.read_u32(big_endian);
328        let end_base = reader.read_u32(big_endian);
329        let file_size = reader.read_u64(big_endian);
330        let items_per_slot = reader.read_u32(big_endian);
331
332        // skip over the reserved region and get the root offset
333        let root_offset = reader.seek(SeekFrom::Current(4))?;
334
335        Ok(CIRTreeFile{
336            big_endian,
337            block_size,
338            item_count,
339            start_chrom_ix,
340            start_base,
341            end_chrom_ix,
342            end_base,
343            file_size,
344            items_per_slot,
345            root_offset,
346        })
347    }
348
349    fn find_blocks<T: Read + Seek>(&self, chrom_id: u32, start: u32, end: u32, reader: &mut T) -> Result<Vec<FileOffsetSize>, Error> {
350        let mut blocks = Vec::<FileOffsetSize>::new();
351        let mut offsets = VecDeque::new();
352        offsets.push_back(self.root_offset);
353        while let Some(offset) = offsets.pop_front() {
354            // move to the offset
355            reader.seek(SeekFrom::Start(offset))?;
356            
357            // read block header
358            let is_leaf = reader.read_u8();
359            let _reserved = reader.read_u8();
360            let child_count = reader.read_u16(self.big_endian);
361
362            if is_leaf != 0 {
363                for _  in 0..child_count {
364                    let start_chrom = reader.read_u32(self.big_endian);
365                    let start_base = reader.read_u32(self.big_endian);
366                    let end_chrom = reader.read_u32(self.big_endian);
367                    let end_base = reader.read_u32(self.big_endian);
368                    let offset = reader.read_u64(self.big_endian).try_into()?;
369                    let size = reader.read_u64(self.big_endian).try_into()?;
370                    //eprint!("chrom_id {}; start {}; end {}; start_chrom {}; start_base {}; end_chrom {}; end_base {};",
371                    //          chrom_id, start, end, start_chrom, start_base, end_chrom, end_base);
372                    if cir_overlaps(chrom_id, start, end, start_chrom, start_base, end_chrom, end_base) {
373                        blocks.push(FileOffsetSize{offset, size})
374                    }
375                }
376            } else {
377                for _ in 0..child_count {
378                    // load the data in the Node
379                    let start_chrom = reader.read_u32(self.big_endian);
380                    let start_base = reader.read_u32(self.big_endian);
381                    let end_chrom = reader.read_u32(self.big_endian);
382                    let end_base = reader.read_u32(self.big_endian);
383                    let offset = reader.read_u64(self.big_endian);
384
385                    // if we have overlaps in this area, then we should explore the node
386                    //eprint!("chrom_id {}; start {}; end {}; start_chrom {}; start_base {}; end_chrom {}; end_base {};",
387                    //         chrom_id, start, end, start_chrom, start_base, end_chrom, end_base);
388                    if cir_overlaps(chrom_id, start, end, start_chrom, start_base, end_chrom, end_base) {
389                        offsets.push_back(offset);
390                    }
391                }
392            }
393        }
394        Ok(blocks)
395    }
396}
397
398#[derive(Debug)]
399pub struct BigBed<T: Read + Seek>  {
400    reader: T,
401    pub big_endian: bool,
402    pub version: u16,
403    pub zoom_levels: u16,
404    pub chrom_tree_offset: u64,
405    pub unzoomed_data_offset: u64,
406    pub unzoomed_index_offset: u64,
407    pub field_count: u16,
408    pub defined_field_count: u16,
409    pub as_offset: u64,
410    pub total_summary_offset: u64,
411    pub uncompress_buf_size: usize,
412    pub extension_offset: u64,
413    pub level_list: Vec<ZoomLevel>,
414    pub extension_size: Option<u16>,
415    pub extra_index_count: Option<u16>,
416    pub extra_index_list_offset: Option<u64>,
417    chrom_bpt: BPlusTreeFile,
418    unzoomed_cir: Option<CIRTreeFile>,
419}
420
421impl<T: Read + Seek> BigBed<T> {
422    pub fn from_file(mut reader: T) -> Result<BigBed<T>, Error> {
423        let mut buff = [0; 4];
424        reader.read_exact(&mut buff)?;
425        let big_endian =
426            if buff == BIGBED_SIG {
427                true
428            } else if buff.iter().eq(BIGBED_SIG.iter().rev()) {
429                false
430            } else {
431                return Err(Error::BadSig{expected: BIGBED_SIG, received: buff});
432            };
433        let version = reader.read_u16(big_endian);
434        let zoom_levels = reader.read_u16(big_endian);
435        let chrom_tree_offset = reader.read_u64(big_endian);
436        let unzoomed_data_offset = reader.read_u64(big_endian);
437        let unzoomed_index_offset = reader.read_u64(big_endian);
438        let field_count = reader.read_u16(big_endian);
439        let defined_field_count = reader.read_u16(big_endian);
440        let as_offset = reader.read_u64(big_endian);
441        let total_summary_offset = reader.read_u64(big_endian);
442        let uncompress_buf_size = reader.read_u32(big_endian).try_into()?;
443        let extension_offset = reader.read_u64(big_endian);
444
445        let mut level_list: Vec<ZoomLevel> = Vec::with_capacity(usize::from(zoom_levels));
446        for _ in 0..usize::from(zoom_levels) {
447            level_list.push(ZoomLevel{
448                reduction_level: reader.read_u32(big_endian),
449                reserved: reader.read_u32(big_endian),
450                data_offset: reader.read_u64(big_endian),
451                index_offset: reader.read_u64(big_endian)
452            })
453        }
454
455        let mut extension_size = None;
456        let mut extra_index_count = None;
457        let mut extra_index_list_offset = None;
458
459        if extension_offset != 0 {
460            // move to extension
461            reader.seek(SeekFrom::Start(extension_offset))?;
462            extension_size = Some(reader.read_u16(big_endian));
463            extra_index_count = Some(reader.read_u16(big_endian));
464            extra_index_list_offset = Some(reader.read_u64(big_endian));
465        }
466
467        //move to the B+ tree file region
468        reader.seek(SeekFrom::Start(chrom_tree_offset))?;
469        let chrom_bpt = BPlusTreeFile::with_reader(&mut reader)?;
470
471        Ok(BigBed{
472            reader, big_endian, version, zoom_levels, chrom_tree_offset, 
473            unzoomed_data_offset, unzoomed_index_offset, field_count,
474            defined_field_count, as_offset, total_summary_offset, 
475            uncompress_buf_size, extension_offset, level_list,
476            extension_size, extra_index_count, extra_index_list_offset,
477            chrom_bpt, unzoomed_cir: None,
478        })
479    }
480    
481    pub fn attach_unzoomed_cir(&mut self) -> Result<(), Error>{
482        if self.unzoomed_cir.is_none() {
483            // if not, seek to where the reader should be
484            self.reader.seek(SeekFrom::Start(self.unzoomed_index_offset))?;
485            // and attach the index (i.e. read the header)
486            self.unzoomed_cir = Some(
487                CIRTreeFile::with_reader(&mut self.reader)?
488            );
489        }
490        Ok(())
491    }
492    
493    pub fn overlapping_blocks(&mut self, chrom_id: u32, 
494                          start: u32, end: u32) -> Result<Vec<FileOffsetSize>, Error> {
495        
496        // ensure that unzoomed_cir is attached
497        self.attach_unzoomed_cir()?;
498        // this operation is guaranteed to work now
499        let index = self.unzoomed_cir.as_ref().unwrap();
500        Ok(index.find_blocks(chrom_id, start, end, &mut self.reader)?)
501    }
502 
503    pub fn query(&mut self, chrom: &str, start: u32, end: u32, max_items: u32) -> Result<Vec<BedLine>, Error> {
504        let mut lines: Vec<BedLine> = Vec::new();
505        let mut item_count: u32 = 0;
506
507        let chrom_id: Option<u32>;
508        // search for the chrom_id
509        if let Some(chrom_data) = self.find_chrom(chrom)? {
510            chrom_id = Some(chrom_data.id);
511        // search for chrom_id without the 'chr'
512        } else if let Some(chrom_data) = self.find_chrom(&chrom[3..])? {
513            chrom_id = Some(chrom_data.id);
514        } else {
515            return Err(BadChrom(chrom.to_owned()));
516        }
517        // this operation is safe, otherwise the return above will be invoked
518        let chrom_id = chrom_id.unwrap();
519        // from kent:
520        // "Find blocks with padded start and end to make sure we include zero-length insertions"
521        let padded_start = if start > 0 {start - 1} else {start};
522        let padded_end = end + 1;
523        let blocks = self.overlapping_blocks(chrom_id, padded_start, padded_end)?;
524        
525        let mut decompressor = None;
526        let mut decom_buff = None;
527        if self.uncompress_buf_size > 0 {
528            decompressor = Some(Decompress::new(true));
529            decom_buff = Some(vec![0u8; self.uncompress_buf_size]);
530        }
531
532        let mut remaining = &blocks[..];
533        while remaining.len() > 0 {
534            // iterate through the list of blocks, get a slice of contiguous blocks
535            let split = find_file_offset_gap(remaining);
536            let before_gap = split.0;
537            remaining = split.1;
538
539            // get the offset
540            let merged_offset = before_gap[0].offset;
541            // get the total size
542            // note: these unwraps are safe because we must have at least one element
543            // (otherwise the loop would terminate)
544            let merged_size = before_gap.last().unwrap().offset + before_gap.last().unwrap().size - merged_offset;
545            // read in all the contigious blocks
546            let mut merged_buff: Vec<u8> = vec![0; merged_size as usize];
547            self.reader.seek(SeekFrom::Start(merged_offset.try_into()?))?;
548            self.reader.read_exact(&mut merged_buff)?;
549            
550            
551            // for each block in the merged group
552            for block in before_gap {
553                let mut index: usize = 0;
554                let block_start = block.offset - merged_offset;
555                let mut block_end = block_start + block.size;
556                let mut buff = &merged_buff[block_start..block_end];
557                if self.uncompress_buf_size > 0 {
558                    let debuff =  decom_buff.as_mut().unwrap();
559                    let decomp =  decompressor.as_mut().unwrap();
560                    let status = decomp.decompress(&buff, debuff, FlushDecompress::Finish)?;
561                    match status {
562                        flate2::Status::Ok | flate2::Status::StreamEnd => {}
563                        _ => {
564                            eprintln!("{:?}", status);
565                            return Err(Error::Misc("Decompression error!"));
566                        }
567                    }
568                    block_end = decomp.total_out() as usize;
569                    decomp.reset(true);
570                    buff = &*debuff;
571                }
572                // iterate over the individual bytes in this block
573                while index < block_end {
574                    // read in chrom_id
575                    let bytes: [u8; 4] = buff[index..index+4].try_into().expect("Failed to convert bytes");
576                    let chr = if self.big_endian {u32::from_be_bytes(bytes)} else {u32::from_le_bytes(bytes)};
577                    index += 4;
578                    // read in start
579                    let bytes: [u8; 4] = buff[index..index+4].try_into().expect("Failed to convert bytes");
580                    let s = if self.big_endian {u32::from_be_bytes(bytes)} else {u32::from_le_bytes(bytes)};
581                    index += 4;
582                    // read in end
583                    let bytes: [u8; 4] = buff[index..index+4].try_into().expect("Failed to convert bytes");
584                    let e = if self.big_endian {u32::from_be_bytes(bytes)} else {u32::from_le_bytes(bytes)};
585                    index += 4;
586
587                    // calculate how much data is left (if any)
588                    // find the next '\0' character
589                    let mut rest_length = 0;
590                    for (index, byte) in buff[index..block_end].iter().enumerate() {
591                        if byte == &0 {
592                            rest_length = index;
593                            break;
594                        }
595                    }
596                    // check if this data is in the correct range
597                    if chr == chrom_id && ( (s < end && e > start) || (s == e && (s == end || end == start) )) {
598                        item_count += 1;
599                        if max_items > 0 && item_count > max_items {
600                            break;
601                        }
602                        // get the rest of the data if it is present
603                        let rest = if rest_length > 0 {
604                            Some(String::from_utf8(buff[index..rest_length+index].to_vec()).expect("FUCK"))
605                        } else {
606                            None
607                        };
608                        // add the BedLine to the list
609                        lines.push(BedLine{
610                            chrom_id: chr,
611                            start: s,
612                            end: e,
613                            rest
614                        });
615                    }
616                    // rest_length + 1 will be at the null character
617                    index += rest_length + 1;
618                }
619                // propagate the break statement
620                if max_items > 0 && item_count > max_items {
621                    break;
622                }
623            }
624            if max_items > 0 && item_count > max_items {
625                break;
626            }
627        }
628        Ok(lines)
629    }
630
631    pub fn write_bed(&mut self, chrom: Option<&str>, start: Option<u32>, end: Option<u32>, max_items: Option<u32>, mut output: impl Write) -> Result<(), Error> {
632        let item_count = 0;
633        for chrom_data in self.chrom_list()? {
634            //TODO: check for null characters
635            if let Some(name) = chrom {
636                if name != strip_null(&chrom_data.name) {
637                    continue
638                }
639            }
640            let start = match start {
641                None => 0,
642                Some(value) => value,
643            };
644            let end = match end {
645                None => chrom_data.size,
646                Some(value) => value,
647            };
648            // check on the total number of items
649            let mut items_left = 0;
650            if let Some(max_value) = max_items {
651                items_left = max_value - item_count;
652                // stop iteration if we have exceeded the limit
653                if items_left <= 0 {
654                    break;
655                }
656            }
657
658            let name_to_print = strip_null(&chrom_data.name);
659            let interval_list = self.query(&chrom_data.name, start, end, items_left)?;
660            for bed_line in interval_list.into_iter() {
661                match bed_line.rest {
662                    None => {
663                        output.write(format!("{}\t{}\t{}\n", name_to_print, bed_line.start, bed_line.end).as_bytes())?;
664                    } Some(data) => {
665                        output.write(format!("{}\t{}\t{}\t{}\n", name_to_print, bed_line.start, bed_line.end, data).as_bytes())?;
666                    }
667                }
668            }
669        }
670        Ok(())
671    }
672
673    
674    pub fn to_string(&mut self, chrom: Option<&str>, start: Option<u32>, end: Option<u32>, max_items: Option<u32>) -> Result<Vec<String>, Error> {
675        //TODO: use the unzoomed circle to get an item count here
676        let mut output: Vec<String> = Vec::new();
677        let item_count = 0;
678        for chrom_data in self.chrom_list()? {
679            //TODO: check for null characters
680            if let Some(name) = chrom {
681                if name != strip_null(&chrom_data.name) {
682                    continue
683                }
684            }
685            let start = match start {
686                None => 0,
687                Some(value) => value,
688            };
689            let end = match end {
690                None => chrom_data.size,
691                Some(value) => value,
692            };
693            // check on the total number of items
694            let mut items_left = 0;
695            if let Some(max_value) = max_items {
696                items_left = max_value - item_count;
697                // stop iteration if we have exceeded the limit
698                if items_left <= 0 {
699                    break;
700                }
701            }
702
703            let name_to_print = strip_null(&chrom_data.name);
704            let interval_list = self.query(&chrom_data.name, start, end, items_left)?;
705            for bed_line in interval_list.into_iter() {
706                match bed_line.rest {
707                    None => {
708                        output.push(format!("{}\t{}\t{}\n", name_to_print, bed_line.start, bed_line.end));
709                    } Some(data) => {
710                        output.push(format!("{}\t{}\t{}\t{}\n", name_to_print, bed_line.start, bed_line.end, data));
711                    }
712                }
713            }
714        }
715        Ok(output)
716    } 
717
718    pub fn chrom_list(&mut self) -> Result<Vec<Chrom>, Error> {
719        self.chrom_bpt.chrom_list(&mut self.reader)
720    }
721
722    pub fn find_chrom(&mut self, chrom: &str) -> Result<Option<Chrom>, Error> {
723        self.chrom_bpt.find(chrom, &mut self.reader)
724    }
725}
726
727#[cfg(test)]
728mod test_bb {
729    use std::fs::File;
730    use super::*;
731
732    //TODO: add testcase for nonexistent file
733    fn bb_from_file(filename: &str) -> Result<BigBed<File>, Error> {
734        BigBed::from_file(File::open(filename)?)
735    }
736
737    //test for file signatures
738    #[test]
739    fn from_file_not_bigbed() {
740        // this produces a 'File I/O error because the file is empty (no bytes can be read)
741        let result = bb_from_file("test/beds/empty.bed").unwrap_err();
742        if let Error::IOError(_) = result {
743            // do a more manual check?
744        } else {
745            panic!("Expected IOError, received {:?}", result)
746        }
747        let result = bb_from_file("test/beds/one.bed").unwrap_err();
748        assert_eq!(result, Error::BadSig{expected: BIGBED_SIG, received: [99, 104, 114, 55]});
749        let result = bb_from_file("test/notbed.png").unwrap_err();
750        assert_eq!(result, Error::BadSig{expected: BIGBED_SIG, received: [137, 80, 78, 71]});
751    }
752
753    //test a bigbed made from a one-line bed file
754    #[test]
755    fn from_file_onebed() {
756        let bb = bb_from_file("test/bigbeds/one.bb").unwrap();
757        assert_eq!(bb.as_offset, 304);
758        assert_eq!(bb.chrom_tree_offset, 628);
759        assert_eq!(bb.defined_field_count, 3);
760        assert_eq!(bb.extension_offset, 564);
761        assert_eq!(bb.extension_size, Some(64));
762        assert_eq!(bb.extra_index_count, Some(0));
763        assert_eq!(bb.extra_index_list_offset, Some(0));
764        assert_eq!(bb.field_count, 3);
765        assert_eq!(bb.big_endian, false);
766        assert_eq!(bb.total_summary_offset, 524);
767        assert_eq!(bb.uncompress_buf_size, 16384);
768        assert!(bb.unzoomed_cir.is_none());
769        assert_eq!(bb.unzoomed_data_offset, 676);
770        assert_eq!(bb.unzoomed_index_offset, 700);
771        assert_eq!(bb.version, 4);
772        assert_eq!(bb.zoom_levels, 1);
773        assert_eq!(bb.level_list, vec![
774            ZoomLevel{reduction_level: 107485656, reserved: 0, data_offset: 6904, index_offset: 6936}
775        ])
776    }
777
778    #[test]
779    fn from_file_longbed() {
780        let bb = bb_from_file("test/bigbeds/long.bb").unwrap();
781        assert_eq!(bb.as_offset, 304);
782        assert_eq!(bb.chrom_tree_offset, 628);
783        assert_eq!(bb.defined_field_count, 3);
784        assert_eq!(bb.extension_offset, 564);
785        assert_eq!(bb.extension_size, Some(64));
786        assert_eq!(bb.extra_index_count, Some(0));
787        assert_eq!(bb.extra_index_list_offset, Some(0));
788        assert_eq!(bb.field_count, 3);
789        assert_eq!(bb.big_endian, false);
790        assert_eq!(bb.total_summary_offset, 524);
791        assert_eq!(bb.uncompress_buf_size, 16384);
792        assert!(bb.unzoomed_cir.is_none());
793        assert_eq!(bb.unzoomed_data_offset, 976);
794        assert_eq!(bb.unzoomed_index_offset, 80369);
795        assert_eq!(bb.version, 4);
796        assert_eq!(bb.zoom_levels, 5);
797        assert_eq!(bb.level_list, vec![
798                    ZoomLevel{reduction_level: 2440976, reserved: 0, data_offset: 86757, index_offset: 106847},
799                    ZoomLevel{reduction_level: 9763904, reserved: 0, data_offset: 113067, index_offset: 119611},
800                    ZoomLevel{reduction_level: 39055616, reserved: 0, data_offset: 125815, index_offset: 127568},
801                    ZoomLevel{reduction_level: 156222464, reserved: 0, data_offset: 133772, index_offset: 134387},
802                    ZoomLevel{reduction_level: 624889856, reserved: 0, data_offset: 140591, index_offset: 141086}
803        ]);
804    }
805
806    #[test]
807    fn test_chrom_list() {
808        let mut bb = bb_from_file("test/bigbeds/one.bb").unwrap();
809        // should only include the chromosomes mapped in the file
810        assert_eq!(bb.chrom_list().unwrap(), vec![Chrom{name: String::from("chr7"), id: 0, size: 159345973}]);
811        // same list should be generated a second time
812        assert_eq!(bb.chrom_list().unwrap(), vec![Chrom{name: String::from("chr7"), id: 0, size: 159345973}]);
813        // should include all chromosomes
814        let mut bb = bb_from_file("test/bigbeds/long.bb").unwrap();
815        assert_eq!(bb.chrom_list().unwrap(), vec![
816            Chrom{name: String::from("chr1\0"), id: 0, size: 248956422},
817            Chrom{name: String::from("chr10"), id: 1, size: 133797422},
818            Chrom{name: String::from("chr11"), id: 2, size: 135086622},
819            Chrom{name: String::from("chr12"), id: 3, size: 133275309},
820            Chrom{name: String::from("chr13"), id: 4, size: 114364328},
821            Chrom{name: String::from("chr14"), id: 5, size: 107043718},
822            Chrom{name: String::from("chr15"), id: 6, size: 101991189},
823            Chrom{name: String::from("chr16"), id: 7, size: 90338345},
824            Chrom{name: String::from("chr17"), id: 8, size: 83257441},
825            Chrom{name: String::from("chr18"), id: 9, size: 80373285},
826            Chrom{name: String::from("chr19"), id: 10, size: 58617616},
827            Chrom{name: String::from("chr2\0"), id: 11, size: 242193529},
828            Chrom{name: String::from("chr20"), id: 12, size: 64444167},
829            Chrom{name: String::from("chr21"), id: 13, size: 46709983},
830            Chrom{name: String::from("chr22"), id: 14, size: 50818468},
831            Chrom{name: String::from("chr3\0"), id: 15, size: 198295559},
832            Chrom{name: String::from("chr4\0"), id: 16, size: 190214555},
833            Chrom{name: String::from("chr5\0"), id: 17, size: 181538259},
834            Chrom{name: String::from("chr6\0"), id: 18, size: 170805979},
835            Chrom{name: String::from("chr7\0"), id: 19, size: 159345973},
836            Chrom{name: String::from("chr8\0"), id: 20, size: 145138636},
837            Chrom{name: String::from("chr9\0"), id: 21, size: 138394717},
838            Chrom{name: String::from("chrX\0"), id: 22, size: 156040895},
839            Chrom{name: String::from("chrY\0"), id: 23, size: 57227415}
840        ]);
841        let mut bb = bb_from_file("test/bigbeds/tair10-nochr.bb").unwrap();
842        assert_eq!(bb.chrom_list().unwrap(), vec![
843            Chrom{name: String::from("1"), id: 0, size: 30427671},
844            Chrom{name: String::from("2"), id: 1, size: 19698289},
845            Chrom{name: String::from("3"), id: 2, size: 23459830},
846            Chrom{name: String::from("4"), id: 3, size: 18585056},
847            Chrom{name: String::from("5"), id: 4, size: 26975502},
848            Chrom{name: String::from("C"), id: 5, size: 154478},
849            Chrom{name: String::from("M"), id: 6, size: 366924}
850        ]);
851        let mut bb = bb_from_file("test/bigbeds/tair10.bb").unwrap();
852        assert_eq!(bb.chrom_list().unwrap(), vec![
853            Chrom{name: String::from("Chr1"), id: 0, size: 30427671},
854            Chrom{name: String::from("Chr2"), id: 1, size: 19698289},
855            Chrom{name: String::from("Chr3"), id: 2, size: 23459830},
856            Chrom{name: String::from("Chr4"), id: 3, size: 18585056},
857            Chrom{name: String::from("Chr5"), id: 4, size: 26975502},
858            Chrom{name: String::from("ChrC"), id: 5, size: 154478},
859            Chrom{name: String::from("ChrM"), id: 6, size: 366924}
860        ]);
861        // testing with an extremely large chrom.sizes file:
862        let mut bb = bb_from_file("test/bigbeds/mm10.bb").unwrap();
863        assert_eq!(bb.chrom_list().unwrap(), vec![
864            Chrom{name: String::from("chr1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 0, size: 195471971},
865            Chrom{name: String::from("chr10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 1, size: 130694993},
866            Chrom{name: String::from("chr11\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 2, size: 122082543},
867            Chrom{name: String::from("chr12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 3, size: 120129022},
868            Chrom{name: String::from("chr13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 4, size: 120421639},
869            Chrom{name: String::from("chr14\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 5, size: 124902244},
870            Chrom{name: String::from("chr15\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 6, size: 104043685},
871            Chrom{name: String::from("chr16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 7, size: 98207768},
872            Chrom{name: String::from("chr17\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 8, size: 94987271},
873            Chrom{name: String::from("chr18\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 9, size: 90702639},
874            Chrom{name: String::from("chr19\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 10, size: 61431566},
875            Chrom{name: String::from("chr1_GL456210_random"), id: 11, size: 169725},
876            Chrom{name: String::from("chr1_GL456211_random"), id: 12, size: 241735},
877            Chrom{name: String::from("chr1_GL456212_random"), id: 13, size: 153618},
878            Chrom{name: String::from("chr1_GL456213_random"), id: 14, size: 39340},
879            Chrom{name: String::from("chr1_GL456221_random"), id: 15, size: 206961},
880            Chrom{name: String::from("chr2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 16, size: 182113224},
881            Chrom{name: String::from("chr3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 17, size: 160039680},
882            Chrom{name: String::from("chr4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 18, size: 156508116},
883            Chrom{name: String::from("chr4_GL456216_random"), id: 19, size: 66673},
884            Chrom{name: String::from("chr4_GL456350_random"), id: 20, size: 227966},
885            Chrom{name: String::from("chr4_JH584292_random"), id: 21, size: 14945},
886            Chrom{name: String::from("chr4_JH584293_random"), id: 22, size: 207968},
887            Chrom{name: String::from("chr4_JH584294_random"), id: 23, size: 191905},
888            Chrom{name: String::from("chr4_JH584295_random"), id: 24, size: 1976},
889            Chrom{name: String::from("chr5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 25, size: 151834684},
890            Chrom{name: String::from("chr5_GL456354_random"), id: 26, size: 195993},
891            Chrom{name: String::from("chr5_JH584296_random"), id: 27, size: 199368},
892            Chrom{name: String::from("chr5_JH584297_random"), id: 28, size: 205776},
893            Chrom{name: String::from("chr5_JH584298_random"), id: 29, size: 184189},
894            Chrom{name: String::from("chr5_JH584299_random"), id: 30, size: 953012},
895            Chrom{name: String::from("chr6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 31, size: 149736546},
896            Chrom{name: String::from("chr7\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 32, size: 145441459},
897            Chrom{name: String::from("chr7_GL456219_random"), id: 33, size: 175968},
898            Chrom{name: String::from("chr8\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 34, size: 129401213},
899            Chrom{name: String::from("chr9\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 35, size: 124595110},
900            Chrom{name: String::from("chrM\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 36, size: 16299},
901            Chrom{name: String::from("chrUn_GL456239\0\0\0\0\0\0"), id: 37, size: 40056},
902            Chrom{name: String::from("chrUn_GL456359\0\0\0\0\0\0"), id: 38, size: 22974},
903            Chrom{name: String::from("chrUn_GL456360\0\0\0\0\0\0"), id: 39, size: 31704},
904            Chrom{name: String::from("chrUn_GL456366\0\0\0\0\0\0"), id: 40, size: 47073},
905            Chrom{name: String::from("chrUn_GL456367\0\0\0\0\0\0"), id: 41, size: 42057},
906            Chrom{name: String::from("chrUn_GL456368\0\0\0\0\0\0"), id: 42, size: 20208},
907            Chrom{name: String::from("chrUn_GL456370\0\0\0\0\0\0"), id: 43, size: 26764},
908            Chrom{name: String::from("chrUn_GL456372\0\0\0\0\0\0"), id: 44, size: 28664},
909            Chrom{name: String::from("chrUn_GL456378\0\0\0\0\0\0"), id: 45, size: 31602},
910            Chrom{name: String::from("chrUn_GL456379\0\0\0\0\0\0"), id: 46, size: 72385},
911            Chrom{name: String::from("chrUn_GL456381\0\0\0\0\0\0"), id: 47, size: 25871},
912            Chrom{name: String::from("chrUn_GL456382\0\0\0\0\0\0"), id: 48, size: 23158},
913            Chrom{name: String::from("chrUn_GL456383\0\0\0\0\0\0"), id: 49, size: 38659},
914            Chrom{name: String::from("chrUn_GL456385\0\0\0\0\0\0"), id: 50, size: 35240},
915            Chrom{name: String::from("chrUn_GL456387\0\0\0\0\0\0"), id: 51, size: 24685},
916            Chrom{name: String::from("chrUn_GL456389\0\0\0\0\0\0"), id: 52, size: 28772},
917            Chrom{name: String::from("chrUn_GL456390\0\0\0\0\0\0"), id: 53, size: 24668},
918            Chrom{name: String::from("chrUn_GL456392\0\0\0\0\0\0"), id: 54, size: 23629},
919            Chrom{name: String::from("chrUn_GL456393\0\0\0\0\0\0"), id: 55, size: 55711},
920            Chrom{name: String::from("chrUn_GL456394\0\0\0\0\0\0"), id: 56, size: 24323},
921            Chrom{name: String::from("chrUn_GL456396\0\0\0\0\0\0"), id: 57, size: 21240},
922            Chrom{name: String::from("chrUn_JH584304\0\0\0\0\0\0"), id: 58, size: 114452},
923            Chrom{name: String::from("chrX\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 59, size: 171031299},
924            Chrom{name: String::from("chrX_GL456233_random"), id: 60, size: 336933},
925            Chrom{name: String::from("chrY\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"), id: 61, size: 91744698},
926            Chrom{name: String::from("chrY_JH584300_random"), id: 62, size: 182347},
927            Chrom{name: String::from("chrY_JH584301_random"), id: 63, size: 259875},
928            Chrom{name: String::from("chrY_JH584302_random"), id: 64, size: 155838},
929            Chrom{name: String::from("chrY_JH584303_random"), id: 65, size: 158099}
930        ]);
931    }
932    
933    #[test]
934    fn test_find_chrom_one() {
935         let mut bb = bb_from_file("test/bigbeds/one.bb").unwrap();
936         assert_eq!(bb.find_chrom("chr1").unwrap(), None);
937         assert_eq!(bb.find_chrom("chr7").unwrap(), Some(Chrom{name: String::from("chr7"), id: 0, size: 159345973}));
938         // does it work again?
939         assert_eq!(bb.find_chrom("chr7").unwrap(), Some(Chrom{name: String::from("chr7"), id: 0, size: 159345973}));
940         assert_eq!(bb.find_chrom("chr").unwrap(), None);
941         // key too long
942         assert_eq!(bb.find_chrom("chr79"), Err(Error::BadKey(String::from("chr79"), 4)));
943         // should be case-sensitive
944         assert_eq!(bb.find_chrom("cHr7").unwrap(), None);
945         // near-matches don't count
946         assert_eq!(bb.find_chrom("xhr7").unwrap(), None);
947    }
948
949    #[test]
950    fn test_find_chrom_long() {
951        let mut bb = bb_from_file("test/bigbeds/long.bb").unwrap();
952        assert_eq!(bb.find_chrom("chr2\0").unwrap(), Some(Chrom{name: String::from("chr2\0"), id: 11, size: 242193529}));
953        // should work without padding
954        assert_eq!(bb.find_chrom("chr2").unwrap(), Some(Chrom{name: String::from("chr2\0"), id: 11, size: 242193529}));
955        // cannot omit the 'chr'
956        assert_eq!(bb.find_chrom("2").unwrap(), None);
957        // still should have key too long errors
958        assert_eq!(bb.find_chrom("chr2xx"), Err(Error::BadKey(String::from("chr2xx"), 5)));
959    }
960
961    #[test]
962    fn test_overlapping_blocks() {
963        let mut bb = bb_from_file("test/bigbeds/long.bb").unwrap();
964        assert_eq!(bb.overlapping_blocks(0, 100, 1000000), Ok(vec![FileOffsetSize{offset: 984, size: 3324}]));
965        // swapped start and stop positions should produce no blocks
966        assert_eq!(bb.overlapping_blocks(0, 100000, 10), Ok(vec![]));
967        // trying a more narrow range
968        assert_eq!(bb.overlapping_blocks(20, 131366255, 132257727), Ok(vec![FileOffsetSize{offset: 67045, size: 3295}]));
969        // bad chromosome should just produce no blocks
970        assert_eq!(bb.overlapping_blocks(42, 100000, 10), Ok(vec![]));
971    }
972}