// bustools_core 0.16.2
//
// Interacting with the kallisto/bus format of scRNAseq data.
use std::{io::{Read, Seek}};

use crate::{busz::{read_busz_header, utils::{setbits_u32, setbits_u64}}};


/// Header that precedes every block in a compressed busz file.
///
/// It packs two numbers into a single `u64`: the size of the compressed block
/// (in bytes) and the number of BUS records stored in that block.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub (crate) struct CompressedBlockHeader {
    /// Packed header value.
    ///
    /// The 34 most significant bits denote the size of the compressed block in bytes.
    /// The 30 least significant bits denote the number of BUS records in the block.
    header_bytes: u64
}

impl CompressedBlockHeader {
    /// Packs a block size (in bytes) and a record count into a block header.
    ///
    /// The byte count is stored in the upper 34 bits, the record count in the
    /// lower 30 bits.
    ///
    /// # Panics
    /// Panics if `block_size_records` does not fit into 30 bits or if
    /// `block_size_bytes` does not fit into 34 bits.
    pub fn new(block_size_bytes: u64, block_size_records:u64) -> Self {
        // Validate before packing: an oversized value would otherwise spill into
        // (or be shifted out of) the other field.
        // NOTE(review): assumes `setbits_*(n)` yields a value with the `n` lowest
        // bits set, which is how it is used as a mask when decoding below.

        // we only have 30 bits to store the nrecords
        let max_records = setbits_u32(30) as u64;
        if block_size_records > max_records {
            panic!("Cant store more than {} records, trying {}", max_records, block_size_records)
        }
        // we only have 34 bits to store the blocksize
        let max_bytes = setbits_u64(34);
        if block_size_bytes > max_bytes {
            panic!("Cant store more than {} bytes, trying {}", max_bytes, block_size_bytes)
        }

        CompressedBlockHeader {
            header_bytes: (block_size_bytes << 30) | block_size_records,
        }
    }

    /// Wraps an already packed header value without any validation.
    pub fn from_u64(u: u64) ->Self {
        Self { header_bytes: u }
    }

    /// Interprets 8 little-endian bytes as a block header.
    ///
    /// Returns `None` if all bytes are zero, which is treated as the
    /// end-of-file marker.
    pub fn from_bytes(blockheader_bytes: [u8;8]) -> Option<Self> {
        match u64::from_le_bytes(blockheader_bytes) {
            0 => None, // EOF
            packed => Some(Self::from_u64(packed)),
        }
    }

    /// Decodes the header into `(block size in bytes, number of records)`
    /// of the block following the header.
    pub fn get_blocksize_and_nrecords(&self) -> (u64, u64) {
        // encoding scheme imposed by bustools: the lower 30 bits hold the
        // record count, everything above holds the block size
        let record_mask = setbits_u32(30) as u64;
        let block_size_bytes = self.header_bytes >> 30;
        let block_size_records = self.header_bytes & record_mask;
        (block_size_bytes, block_size_records)
    }

    /// Returns the packed header value.
    pub fn get_header_bytes(&self) -> u64 {
        self.header_bytes
    }
}


/// Reads the next 8 bytes from `reader` and decodes them as a compressed block header.
///
/// Returns whatever `CompressedBlockHeader::from_bytes` makes of those bytes.
///
/// # Panics
/// Panics if 8 bytes cannot be read from `reader`.
pub(crate) fn load_block_header(reader: &mut impl Read) -> Option<CompressedBlockHeader> {
    const HEADER_LEN: usize = 8;

    let mut raw = [0_u8; HEADER_LEN];
    reader
        .read_exact(&mut raw)
        .expect("couldnt read the busz block header");

    CompressedBlockHeader::from_bytes(raw)
}

/// Location and size of a single block.
#[derive(Debug)]
pub(crate) struct BlockParams {
    /// Position where the block's content starts (after the block header, not at it).
    pub(crate) start: u64,
    /// Number of content bytes in the block.
    pub(crate) nbytes: u64,
    /// Number of records in the block.
    pub(crate) nrecords: u64,
}

/// Get the byte positions of all blocks in the file.
///
/// First reads the busz header via `read_busz_header`, then walks the stream
/// block by block: each block header is loaded, the position right after it is
/// recorded as the block's `start`, and the block's content is skipped by
/// seeking `nbytes` forward. The walk ends when `load_block_header` returns
/// `None`.
///
/// Returns one [`BlockParams`] per block, in file order.
///
/// # Panics
/// Panics if the stream position cannot be queried or the seek past a block
/// fails (and wherever `read_busz_header` / `load_block_header` panic).
pub (crate) fn get_block_starts(reader: &mut (impl Read + Seek)) -> Vec<BlockParams> {
    // The parsed header values are not needed here; the call only advances the reader.
    let (_params, _header) = read_busz_header(reader);

    let mut index: Vec<BlockParams> = Vec::new();
    while let Some(header) = load_block_header(reader) {
        // The reader now sits directly after the block header, i.e. at the block content.
        let start = reader
            .stream_position()
            .expect("couldnt determine the position of the busz block");
        let (nbytes, nrecords) = header.get_blocksize_and_nrecords();
        index.push(BlockParams { start, nbytes, nrecords });

        // `nbytes` comes from a u64 shifted right by 30 bits, so it has at most
        // 34 significant bits and always fits into an i64.
        reader
            .seek_relative(nbytes as i64)
            .expect("couldnt skip over the busz block content");
    }
    index
}