sbwt 0.4.2 - Docs.rs

//! Miscellaneous utility functions and constants used in the crate.

use std::cmp::min;
use std::io::Cursor;
use std::io::Read;
use std::ops::Range;

use bitvec::prelude::*;
use rayon::iter::ParallelBridge;
use rayon::iter::IntoParallelRefIterator;
use rayon::iter::ParallelIterator;
use simple_sds_sbwt::serialize::Serialize;

type BitVec = bitvec::vec::BitVec<u64, Lsb0>;
type BitSlice = bitvec::slice::BitSlice<u64, Lsb0>;

// Returns the number of bytes written
pub(crate) fn write_bytes<W: std::io::Write>(out: &mut W, bytes: &[u8]) -> std::io::Result<usize>{
    out.write_all(bytes)?;
    Ok(bytes.len() + 8)
}

// Searcher the range [0..n)
// Return the index of the answer, or n if does not exist
pub(crate) fn binary_search_leftmost_that_fulfills_pred<T, Access: Fn(usize) -> T, Pred: Fn(T) -> bool>(access: Access, pred: Pred, n: usize) -> usize {
    let mut ans = n;
    let mut step = n.next_power_of_two();
    while step > 0 {
        while ans as isize - step as isize >= 0 && pred(access(ans-step)) {
            ans -= step;
        }
        step /= 2;
    }
    ans
}

pub const DNA_ALPHABET: [u8; 4] = [b'A', b'C', b'G', b'T'];

// This bit vector of length 256 marks the ascii values of these characters: acgtACGT
const IS_DNA: BitArray<[u32; 8]> = bitarr![const u32, Lsb0; 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0];

pub(crate) fn is_dna(c: u8) -> bool {
    IS_DNA[c as usize]
}

// A table mapping mapping ascii A -> 0, C -> 1, G -> 2, T -> 3. Same for lower case.
// All other charcters map to 255. Other code depends on this choice: don't touch it. 
pub const ACGT_TO_0123: [u8; 256] = [255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 1, 255, 255, 255, 2, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 3, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 1, 255, 255, 255, 2, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 3, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255];

#[allow(non_snake_case)] // C-array is an established convention in BWT indexes
fn popcounts_to_C_array(popcounts: &[usize]) -> Vec<usize> {
    let sigma = popcounts.len();
    let mut C: Vec<usize> = vec![0; sigma];
    for c in 0..sigma {
        for d in (c + 1)..sigma {
            C[d] += popcounts[c];
        }
    }

    // Plus one for the ghost dollar
    #[allow(clippy::needless_range_loop)] // Is perfectly clear this way
    for c in 0..sigma {
        C[c] += 1;
    }

    C

}

#[allow(non_snake_case)] // C-array is an established convention in BWT indexes
pub(crate) fn get_C_array(rows: &[bitvec::vec::BitVec::<u64, Lsb0>]) -> Vec<usize> {
    let popcounts: Vec<usize> = rows.iter().map(|row| row.count_ones()).collect(); 
    popcounts_to_C_array(&popcounts)
}

#[allow(non_snake_case, dead_code)] // C-array is an established convention in BWT indexes
pub(crate) fn get_C_array_parallel(rawrows: &[bitvec::vec::BitVec::<u64, Lsb0>], n_threads: usize) -> Vec<usize> {
    let sigma = rawrows.len();
    assert!(sigma > 0);

    let thread_pool = rayon::ThreadPoolBuilder::new().num_threads(n_threads).build().unwrap();
    thread_pool.install(||{
        let popcounts: Vec<usize> = rawrows.par_iter().map(|row| row.count_ones()).collect(); 
        popcounts_to_C_array(&popcounts)
    })
}

/// Reverses the given ASCII DNA sequence and replaces each nucleotide with its complement.
pub fn reverse_complement_in_place(seq: &mut [u8]){
    jseqio::reverse_complement_in_place(seq);
}

#[allow(dead_code)]
pub(crate) struct FastXReader{
    inner: jseqio::reader::DynamicFastXReader
}

impl crate::SeqStream for FastXReader{
    fn stream_next(&mut self) -> Option<&[u8]> {
        self.inner.read_next().unwrap().map(|x| x.seq)
    }
}

/// Creates a [crate::SeqStream] out of a slice of ASCII sequences.
pub struct SliceSeqStream<'a>{
    slices: &'a [ &'a [u8]],
    cur_slice_idx: usize,
}

impl<'a> SliceSeqStream<'a> {
    /// Creates a [crate::SeqStream] out of a slice of ASCII sequences.
    pub fn new(slices: &'a [&'a [u8]]) -> Self {
        Self {slices, cur_slice_idx: 0}
    }
}

impl crate::SeqStream for SliceSeqStream<'_> {
    fn stream_next(&mut self) -> Option<&[u8]> {
        if self.cur_slice_idx == self.slices.len() {
            None
        } else {
            let s = self.slices[self.cur_slice_idx];
            self.cur_slice_idx += 1;
            Some(s)
        }
    } 
}

/// Creates a [crate::SeqStream] out of a slice of ascii vectors.
pub struct VecSeqStream<'a> {
    seqs: &'a [Vec<u8>],
    cur_seq_idx: usize,
}

impl<'a> VecSeqStream<'a> {
    /// Creates a [crate::SeqStream] out of a slice of ascii vectors.
    pub fn new(seqs: &'a [Vec<u8>]) -> Self {
        Self {seqs, cur_seq_idx: 0}
    }
}

impl crate::SeqStream for VecSeqStream<'_> {
    fn stream_next(&mut self) -> Option<&[u8]> {
        if self.cur_seq_idx == self.seqs.len() {
            None
        } else {
            let s = &self.seqs[self.cur_seq_idx];
            self.cur_seq_idx += 1;
            Some(s)
        }
    } 
}


// This function runs in parallel, so a rayon thread pool must be initialized.
pub(crate) fn parallel_bitvec_concat(bitvecs: Vec<BitVec>) -> BitVec {
    if bitvecs.len() == 1 {
        return bitvecs.into_iter().next().unwrap(); // Nothing to do
    }
    let total_length = bitvecs.iter().fold(0_usize, |acc, s| acc + s.len());
    let n_words = total_length.div_ceil(64);
    let mut output_data = vec![0_u64; n_words];

    let mut exclusive_input_bitslices = Vec::<&BitSlice>::new(); // One per nonempty input bitvec
    let mut exclusive_output_word_ranges = Vec::<Range<usize>>::new(); // One per nonempty input bitvec

    // Figure out which words will be potentially shared between input slices in the concatenation,
    // and set those bits. Push the exclusive input and output regions to vecs.
    let mut bits_so_far = 0_usize;
    for s in bitvecs.iter() {
        if s.is_empty() {
            continue // Nothing to concatenate
        }

        // Copy the part that falls in the first output word
        let first_word = bits_so_far / 64; // The first word that will be written to
        let first_word_bit_offset = bits_so_far % 64; // Index of the first bit that is written in the first word
        let n_bits_written_to_first_word = min(s.len(), 64 - first_word_bit_offset);
        let first_out = BitSlice::from_element_mut(&mut output_data[first_word]);
        let first_out_slice = &mut first_out[first_word_bit_offset..first_word_bit_offset + n_bits_written_to_first_word];
        first_out_slice.copy_from_bitslice(&s[0..n_bits_written_to_first_word]);

        // Copy the part that falls in the last output word (can be the same
        // as the first output word but that's okay).
        bits_so_far += s.len();
        let last_bit = bits_so_far - 1; // >= 0 because s.len() > 0
        let last_word = last_bit / 64;
        let last_word_bit_offset = last_bit % 64; // Index of the last bit that is written in the last word
        let n_bits_written_to_last_word = min(s.len(), last_word_bit_offset + 1);
        let last_out_word = BitSlice::from_element_mut(&mut output_data[last_word]);
        let last_out_slice = &mut last_out_word[(1 + last_word_bit_offset - n_bits_written_to_last_word)..=last_word_bit_offset];
        let last_in_slice = &s[(s.len() - n_bits_written_to_last_word)..];
        last_out_slice.copy_from_bitslice(last_in_slice);

        let first_exclusive_word = (first_word + 1) as isize;
        let last_exclusive_word = last_word as isize - 1;

        if first_exclusive_word <= last_exclusive_word {
            // Nonempty range of exclusive words
            exclusive_input_bitslices.push(&s[n_bits_written_to_first_word..(s.len()-n_bits_written_to_last_word)]);
            exclusive_output_word_ranges.push(first_exclusive_word as usize .. last_exclusive_word as usize +1);
        }

    }

    let exclusive_output_word_ranges = split_to_mut_regions(&mut output_data, exclusive_output_word_ranges);

    // Copy non-overlapping parts in parallel
    assert_eq!(exclusive_input_bitslices.len(), exclusive_output_word_ranges.len());
    exclusive_output_word_ranges.into_iter().enumerate().par_bridge().for_each(|(i, out_word_range)| {
        let out = BitSlice::from_slice_mut(out_word_range);
        out.copy_from_bitslice(exclusive_input_bitslices[i]);
    });

   let mut concat = BitVec::from_vec(output_data); // Reinterpret as BitVec
   concat.truncate(total_length); // Get rid of the tail in the last word
   concat

}

/// Returns a mutable slice for every region in `regions`.
///
/// ## Preconditions  (guaranteed by the caller)
/// * All ranges are inside `0..v.len()`.
/// * The ranges are sorted by `start` and do **not** overlap.
pub(crate) fn split_to_mut_regions(
    v: &mut Vec<u64>,
    regions: Vec<Range<usize>>,
) -> Vec<&mut [u64]> {
    let mut result = Vec::with_capacity(regions.len());
    let mut tail: &mut [u64] = v.as_mut_slice();
    let mut consumed = 0; // absolute index we have reached in `v`

    for r in regions {
        // translate the absolute `start` to an index inside `tail`
        let rel_start = r.start - consumed;
        let len       = r.end   - r.start;

        // First split off everything before the wanted region …
        let (_, after_start)    = tail.split_at_mut(rel_start);
        // ... then split that remainder into the desired region and the rest.
        let (region, after_end) = after_start.split_at_mut(len);

        result.push(region); // keep the region
        tail = after_end; // keep working with the suffix
        consumed = r.end; // advance the absolute cursor
    }

    result
}

pub(crate) fn bitvec_to_simple_sds_raw_bitvec(mut bv: bitvec::vec::BitVec::<u64, Lsb0>) -> simple_sds_sbwt::raw_vector::RawVector {
    // TODO: We really hope that usize equals u64 here, otherwise this this is probably broken.
    // Let's use the deserialization function in simple_sds_sbwt for a raw bitvector.
    // It requires the following header:
    let mut header = [0u64, 0u64]; // bits, words
    header[0] = bv.len() as u64; // Assumes little-endian byte order
    header[1] = bv.len().div_ceil(64) as u64;

    let header_bytes = bytemuck::cast_slice(&header);

    // Make sure the leftover bits in the last word are zeros. Simple-sds
    // depends on this, but the bitvec crate does not guarantee this!
    // The undefined padding bytes have broken my code before, so this is
    // crucial.
    let original_len = bv.len();
    bv.resize(original_len.next_multiple_of(64), false);
    bv.resize(original_len, false);

    let raw_data = bytemuck::cast_slice(bv.as_raw_slice());
    let mut data_with_header = Cursor::new(header_bytes).chain(Cursor::new(raw_data));

    simple_sds_sbwt::raw_vector::RawVector::load(&mut data_with_header).unwrap()
}

pub(crate) fn segment_range(range: Range<usize>, n_pieces: usize) -> Vec<Range<usize>> {
    let segment_len = range.len().div_ceil(n_pieces);
    let mut pieces: Vec<Range<usize>> = vec![];
    for t in 0..n_pieces{
        let mut s = range.start + t*segment_len;
        let mut e = range.start + min((t+1)*segment_len, range.len());
        if s >= range.end { // Happens e.g. if range.len() == 1 and n_pieces == 10
            s = range.end;
            e = range.end;
        }
        pieces.push(s..e); // Final segments may be empty. Is ok.
    }
    pieces
}

#[cfg(test)]
pub(crate) fn gen_random_dna_string(len: usize, seed: u64) -> Vec<u8> {
    use rand_chacha::rand_core::{RngCore, SeedableRng};

    let mut rng = rand_chacha::ChaCha20Rng::seed_from_u64(seed);
    (0..len).map(|_| { 
        match rng.next_u64() % 4 {
            0 => b'A',
            1 => b'C',
            2 => b'G',
            3 => b'T',
            _ => panic!("Impossible")
        }
    }).collect()
}


// Calls the callback for each bit in the range. This seems to be
// about 6 times faster than using the iter_ones() function of BitVec
// in the bitvec::crate.
#[inline(always)] // This made a 25% difference in running time
pub fn for_each_one_bit<F>(words: &[u64], range: Range<usize>, mut cb: F)
where
    F: FnMut(usize),
{
    let s = range.start;
    let e = range.end;

    if words.is_empty() || s >= e {
        return;
    }

    let bit_len = words.len() * 64;
    if s >= bit_len {
        return;
    }

    let start_word = s / 64;
    let end_word = (e - 1) / 64; // inclusive

    // Helper: iterate 1-bits in a single word, adding `base_bit` to positions.
    #[inline]
    fn scan_word<F>(mut word: u64, base_bit: usize, cb: &mut F)
    where
        F: FnMut(usize),
    {
        while word != 0 {
            let tz = word.trailing_zeros() as usize;
            cb(base_bit + tz);
            word &= word - 1; // clear lowest set bit
        }
    }

    // Single-word range: mask both ends in the same word.
    if start_word == end_word {
        let start_bit = s % 64;
        let end_bit = e % 64; // exclusive

        let mut word = words[start_word];

        // Mask out bits before s
        word &= !0u64 << start_bit;

        // Mask out bits at/after e
        if end_bit != 0 {
            word &= (1u64 << end_bit) - 1;
        }

        scan_word(word, start_word * 64, &mut cb);
        return;
    }

    // --- First word: mask only the low bits ---
    {
        let start_bit = s % 64;
        let mut word = words[start_word];

        word &= !0u64 << start_bit;
        scan_word(word, start_word * 64, &mut cb);
    }

    // --- Middle words: no masking at all ---
    for (rel_idx, &word) in words[start_word + 1..end_word].iter().enumerate() {
        scan_word(word, (start_word + 1 + rel_idx) * 64, &mut cb);
    }

    // --- Last word: mask only the high bits ---
    {
        let end_bit = e % 64; // exclusive; 0 means "all 64 bits are in range"
        let mut word = words[end_word];

        if end_bit != 0 {
            word &= (1u64 << end_bit) - 1;
        }
        scan_word(word, end_word * 64, &mut cb);
    }
}

#[cfg(test)]
mod tests {

    use rand::{Rng, SeedableRng};

    use super::{for_each_one_bit, *};

    #[test]
    fn parallel_concatenation() {
        // Generate 1001 pseudorandom bit vectors with lengths between 0 and 256 bits
        let seed = 42;
        let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
        let mut bitvecs: Vec<BitVec> = Vec::new();
        for i in 0..1000 {
            let len = rng.gen_range(0, 256);
            if i == 500 {
                // Add an empty bit vector
                bitvecs.push(BitVec::new());
            }
            let mut bits = BitVec::with_capacity(len);
            for _ in 0..len {
                bits.push(rng.gen_bool(0.5));
            }
            bitvecs.push(bits);
        }

        let true_concat = bitvecs.iter().fold(BitVec::new(), |mut acc, v| {
            acc.extend_from_bitslice(v);
            acc
        });

        let thread_pool = rayon::ThreadPoolBuilder::new().num_threads(3).build().unwrap();
        let our_concat = thread_pool.install(|| {
            parallel_bitvec_concat(bitvecs)
        });

        assert_eq!(true_concat.len(), our_concat.len());
        assert_eq!(true_concat, our_concat);
    }

    fn check_for_each_one_bit(words: &[u64], range: Range<usize>) {
        let bv: bitvec::vec::BitVec::<u64, Lsb0> = bitvec::vec::BitVec::from_vec(words.to_vec());
        let true_bits: Vec<usize> = bv.iter_ones().filter(|b| range.contains(b)).collect();

        let mut our_bits = Vec::<usize>::new();
        for_each_one_bit(words, range, |i| our_bits.push(i));

        assert_eq!(true_bits, our_bits);
    }

    #[test]
    fn for_each_one_bit_zero_word() {
        let words = [0_u64];
        check_for_each_one_bit(&words, 0..64);
    }

    #[test]
    fn for_each_one_bit_empty() {
        let words = [u64::MAX, u64::MAX, u64::MAX];

        for s in 0..=64*3 {
            check_for_each_one_bit(&words, s..s);
        }
        check_for_each_one_bit(&words, 64*3..1000); // Out of bounds
    }

    #[test]
    fn for_each_one_bit_one_word() {
        let mut rng = rand::rngs::StdRng::seed_from_u64(1234);
        let mut bits = bitvec![u64, Lsb0;];
        let range = 10..50;
        for i in 0..64 {
            if range.contains(&i) {
                bits.push(rng.gen_bool(0.5));
            } else {
                bits.push(true); // These bits should be masked out
            }
        }
        eprintln!("{}", bits);
        let words = bits.into_vec();
        check_for_each_one_bit(words.as_slice(), 0..64);
        check_for_each_one_bit(words.as_slice(), range);
        check_for_each_one_bit(words.as_slice(), 20..40);
    }

    #[test]
    fn for_each_one_bit_multiple_words() {
        let mut rng = rand::rngs::StdRng::seed_from_u64(4123);
        let mut bits = bitvec![u64, Lsb0;];
        let n_bits = 512;
        for s in (0..n_bits).step_by(32) {
            for e in (s..n_bits).step_by(128) {
                let range = s..e;
                for i in 0..n_bits {
                    if range.contains(&i) {
                        bits.push(rng.gen_bool(0.5));
                    } else {
                        bits.push(true); // These bits should be masked out
                    }
                }
                //eprintln!("{}", bits);
                let words = bits.clone().into_vec();
                check_for_each_one_bit(words.as_slice(), s..e);
            }
        }
    }

    #[test]
    fn for_each_one_bit_hand_crafted_edge_cases() {
        let words: &[u64] = &[0,0,0,u64::MAX,u64::MAX,0,u64::MAX,u64::MAX];
        check_for_each_one_bit(words, 0..words.len()*64);
        check_for_each_one_bit(words, 10..words.len()*64-10);
    }
}

pub fn for_each_run_with_key<T: Eq, KeyType: Eq, F1: Fn(&T) -> KeyType, F2: FnMut(Range<usize>)>(items: &[T], key_fn: F1, mut callback: F2) {
    if items.is_empty() { return }

    let mut run_start = 0;
    let n = items.len();
    for i in 1..n {
        if key_fn(&items[i]) != key_fn(&items[i-1]) {
            callback(run_start..i);
            run_start = i;
        }
    }
    // Final run
    callback(run_start..n);
}