sbwt 0.4.2 - Docs.rs

//! The [SbwtIndex] data structure. Construct with [SbwtIndexBuilder](crate::SbwtIndexBuilder).

use std::io::Read;
use std::io::Write;
use byteorder::ReadBytesExt;

use byteorder::LittleEndian;
use num::traits::ToBytes;
use rayon::iter::IndexedParallelIterator;
use rayon::iter::IntoParallelIterator;
use rayon::iter::ParallelIterator;
use bitvec::prelude::*;

use crate::compact_int_vector::CompactIntVector;
use crate::compact_int_vector::CompactIntVectorMutSlice;
use crate::sdsl_compatibility::load_known_width_sdsl_int_vector;
use crate::sdsl_compatibility::load_sdsl_bit_vector;
use crate::subsetseq::*;
use crate::util;
use crate::util::ACGT_TO_0123;
use crate::util::DNA_ALPHABET;

pub const CARGO_API_VERSION: &str = env!("CARGO_PKG_VERSION");

/// The SBWT index data structure. Construct with [SbwtIndexBuilder](crate::SbwtIndexBuilder). For the [SubsetSeq] trait implementation,  we recommend using the bit matrix implementation [SubsetMatrix]. 
///
/// # SBWT index 
/// 
/// The SBWT index is a compressed index for searching for k-mers in a set of k-mers. It can be seen a version of the
/// FM-index on sets of k-mers. Here we give a brief overview of the data structure and key concepts that are
/// helpful for understanding the API. For further details, see the [paper](https://doi.org/10.1137/1.9781611977714.20).
/// 
/// ## SBWT graph
/// 
/// To understand the SBWT index, it's helpful to first understand the SBWT graph.
/// The node-centric de Bruijn graph of a set of k-mers R (also called *a k-spectrum*) is a directed graph where the nodes are the k-mers in R,
/// and there is an edge from x to y if x[1..k) = y[0..k-1). The label of the edge is the last character of y.
/// The **SBWT graph** is a modified version of the node-centric de Bruijn graph. There are two modifications:
/// 1. We add a set R' of "dummy nodes". First, the **source set** R' ⊆ R is the subset of k-mers in R that 
///    do not have an incoming edge in the de Bruijn graph, that is x ∈ R' iff x[0..k-1) is not the suffix of 
///    any k-mer in R. The set of dummy nodes is the set of all proper prefixes of the k-mers in R'. We pad the 
///    dummy nodes with dollar symbols from the left so that all dummy nodes have length k. The set R ∪ R' is called 
///    the **padded k-spectrum**. We add all de Bruijn graph edges between overlaps of the padded k-spectrum, that is,
///    for every x ∈ R ∪ R' and y ∈ R ∪ R' such that x[1..k) = y[0..k-1), we add an edge from x to y in the SBWT graph.
///    As a special case, the empty string $^k is always included in R', and the incoming self-loop edge labeled with $ is not included.
/// 2. For every node in R ∪ R' that has more than one incoming edge, we delete all incoming edges except the 
///    one that comes from the colexicographically smallest k-mer. A k-mer x is colexicographically smaller than k-mer y iff the reverse of x is lexicographically smaller than the reverse of y.
/// 
/// 
/// For example, below is the SBWT graph of all 4-mers of strings ["TGTTTG", "TTGCTAT", "ACGTAGTATAT", "TGTAAA"].
/// The source node set is shown in blue, the dummy node set R' in orange, and the de Bruijn graph edges that 
/// have been removed are shown in red.
/// 
/// ![SBWT graph][sbwt_graph] 
/// 
/// ## SBWT definition
/// 
/// The SBWT is a sequence of subsets of {A,C,G,T} such that the i-th subset contains the edge labels of outgoing
/// edges from the i-th k-mer in the SBWT graph in colexicographic order of the k-mers. 
/// If we take the example from above and stack the nodes in colexicographic order, we get: 
///
/// ![SBWT graph][sbwt_sequence] 
/// 
/// The SBWT subset sequence in this example is the sequence of outgoing edge label sets read from top to bottom:
/// {A,T}, {C}, {}, {A}, {T}, {T}, {A,G,T}, {}, {}, {G}, {T}, {T}, {T}, {T}, {C}, {G}, {A}, {}, {}, {A}, {A}, {A}, {A,T}, {T}, {G}.
/// The key property that makes the SBWT index work is that the i-th outgoing edge with a given label on the right column
/// in the picture is the same edge as the i-th incoming edge with the same label on the left column.
/// Thanks to this property, once the subset sequnce has been constructed, the k-mers
/// and the graph can be discarded, with no loss of information. That is, the k-mers and the graph can be reconstructed from the
/// subset sequence alone.
/// 
/// Given the subset sequence, there is also an algorithm to search for a k-mer in O(k) time. The algorithm is essentially the same
/// as the search algorithm on the FM-index. Given a k-mer P[0..k], at iteration i, we have the range of rows in the picture whose
/// k-mers have P[0..i) as a suffix. When i = 0, we have the empty suffix P[0..0), which is a suffix of all k-mers.
/// To update the range for the next iteration, we follow the edges labeled with the next character of the pattern, using the key property
/// mentioned above, as shown in the figure below for query k-mer TATA. This update step can be implemented efficiently by preprocessing the SBWT set sequence for
/// *subset rank queries*. See [this paper](https://doi.org/10.1137/1.9781611977714.20) for more on how to implement these rank queries. We provide
/// an implementation based on a bit matrix at [SubsetMatrix]. Any struct implementing the [SubsetSeq] trait can be used to query the SBWT.
///  
/// ![SBWT graph][sbwt_search] 
/// 
/// 
#[embed_doc_image::embed_doc_image("sbwt_graph", "doc_images/sbwt_graph.svg")] 
#[embed_doc_image::embed_doc_image("sbwt_sequence", "doc_images/sbwt_figure.drawio.svg")] 
#[embed_doc_image::embed_doc_image("sbwt_search", "doc_images/sbwt_figure_with_search.drawio.png")] // This is as .png because there is a bug in vertical centering in the svg export of drawio.

#[derive(Clone, Eq, PartialEq, Debug)]
#[allow(non_snake_case)] // C-array is an established convention in BWT indexes
pub struct SbwtIndex<SS: SubsetSeq> {
    pub(crate) sbwt: SS, // pub(crate) for testing from submodules
    n_kmers: usize,
    k: usize,
    C: Vec<usize>, // Cumulative character counts (includes one ghost dollar)
    prefix_lookup_table: PrefixLookupTable,
}

/// An enum listing SbwtIndex types built on different subset rank implementations provided in this crate. 
/// Currently, only the [SubsetMatrix] structure is supported, but in the future there might be more.
pub enum SbwtIndexVariant {
    SubsetMatrix(SbwtIndex<SubsetMatrix>),
    //SubsetConcat(SbwtIndex<SubsetConcat>), // Remember to update the doc comment if a variant is added
}

/// Loads an index that is wrapped in an enum describing the used subset rank structure type. 
/// The format includes a type identifier so the correct variant can later be loaded with [load_sbwt_index_variant].
pub fn write_sbwt_index_variant(sbwt: &SbwtIndexVariant, out: &mut impl std::io::Write) -> std::io::Result<usize> {
    match sbwt {
        SbwtIndexVariant::SubsetMatrix(sbwt) => {
            out.write_all(&(b"SubsetMatrix".len() as u64).to_le_bytes())?;
            out.write_all(b"SubsetMatrix")?;
            sbwt.serialize(out)
        }
    }
}

/// Loads an index that was stored with [write_sbwt_index_variant]. This includes a type identifier
/// to load the correct subset rank variant.
pub fn load_sbwt_index_variant(input: &mut impl std::io::Read) -> Result<SbwtIndexVariant, Box<dyn std::error::Error>> {
    let type_id_len = input.read_u64::<LittleEndian>().unwrap();
    let mut type_id = vec![0_u8; type_id_len as usize];
    input.read_exact(&mut type_id)?;

    if type_id == b"SubsetMatrix" {
        Ok(SbwtIndexVariant::SubsetMatrix(SbwtIndex::<SubsetMatrix>::load(input)?))
    } else {
        Err("Unknown SBWT index type".into())
    }

}

/// Loads an index that was previously serialized either with the C++ API <https://github.com/algbio/SBWT>,
/// or the associated CLI. Supports only version v0.1.
#[allow(non_snake_case)] // For C-array
pub fn load_from_cpp_plain_matrix_format<R: std::io::Read>(input: &mut R) -> std::io::Result<SbwtIndexVariant> {

    // The file format with the CLI is:
    // [type id len] [type id] [version string id] [version string] [data]

    // For example:
    // 00000000  0c 00 00 00 00 00 00 00  70 6c 61 69 6e 2d 6d 61  |........plain-ma|
    // 00000010  74 72 69 78 04 00 00 00  00 00 00 00 76 30 2e 31  |trix........v0.1|
    // 00000020  b6 09 2c 0f 00 00 00 00  ff ff ff 8b 48 44 2a 95  |..,.........HD*.|
    
    // We only support type-id "plain-matrix" and version v0.1.

    // The file format of the raw API is:
    // [version string id] [version string] [data]

    // We can tell which format we have by checking what the first encoded string is:
    // - If it's the version string, we check it and proceed to load the sbwt.
    // - If it's plain-matrix, we ignore it, then read the next string, and check that it's the correct version.
    // - If it's a string other than plain-matrix or a version string, we throw an error

    let SUPPORTED_CPP_SBWT_VARIANT = b"plain-matrix"; // If you change this, remember to update all the comments above.
    let SUPPORTED_CPP_FILE_FORMAT = b"v0.1"; // If you change this, remember to update all the comments above, including the doc comment.

    let mut string_length = input.read_u64::<LittleEndian>().unwrap();
    if string_length > 100 { // The first string is never this long
        return Err(std::io::ErrorKind::InvalidData.into());
    }
    let mut string_buf = vec![0u8; string_length as usize];
    input.read_exact(&mut string_buf).unwrap();

    if string_buf == SUPPORTED_CPP_SBWT_VARIANT { 
        // Ok. Ignore and read the next string.

        string_length = input.read_u64::<LittleEndian>().unwrap();
        if string_length > 100 { // The second string is never this long
            return Err(std::io::ErrorKind::InvalidData.into());
        }
        string_buf = vec![0u8; string_length as usize];
        input.read_exact(&mut string_buf).unwrap();
    }

    if string_buf == SUPPORTED_CPP_FILE_FORMAT { 
        // Ok. Proceed to deserialize. 

        log::info!("Reading bit vectors");
        let A_bits = load_sdsl_bit_vector(input)?;
        let C_bits = load_sdsl_bit_vector(input)?;
        let G_bits = load_sdsl_bit_vector(input)?;
        let T_bits = load_sdsl_bit_vector(input)?;

        let _A_rank_support = load_known_width_sdsl_int_vector(input, 64)?; // Ignore: we build our own
        let _C_rank_support = load_known_width_sdsl_int_vector(input, 64)?; // Ignore: we build our own
        let _G_rank_support = load_known_width_sdsl_int_vector(input, 64)?; // Ignore: we build our own
        let _T_rank_support = load_known_width_sdsl_int_vector(input, 64)?; // Ignore: we build our own 

        let _suffix_group_starts = load_sdsl_bit_vector(input); // Ignore: we don't use this

        log::info!("Reading the C-array");
        let C_array_byte_length = input.read_u64::<LittleEndian>()? as usize;
        assert!(C_array_byte_length % 8 == 0);
        let C_array_len = C_array_byte_length / 8;
        let mut C_array = Vec::<usize>::with_capacity(C_array_len);
        for _ in 0..C_array_len {
            C_array.push(input.read_u64::<LittleEndian>()? as usize);
        }

        log::info!("Reading the precalc table");
        let precalc_array_byte_length = input.read_u64::<LittleEndian>()?;
        assert!(precalc_array_byte_length % 16 == 0);
        let mut precalc_array_bytes = vec![0u8; precalc_array_byte_length as usize];
        input.read_exact(precalc_array_bytes.as_mut_slice())?;

        log::info!("Reading constants");
        let precalc_k = input.read_u64::<LittleEndian>()? as usize;
        let n_nodes = input.read_u64::<LittleEndian>()? as usize;
        let n_kmers = input.read_u64::<LittleEndian>()? as usize;
        let k = input.read_u64::<LittleEndian>()? as usize;

        log::info!("Permuting the precalc table");
        // For some stupid reason the lookup table in the Rust code stores the ranges in the
        // lexicographic order of the prefixes, not colexicographic like in the C++ code, which
        // also would make more sense since then the intervals are non-decreasing. We could change
        // to colex order in the Rust code, but that would break index compatibility. So we permute
        // the lookup table here to the lexicographic order.
        let n_ranges = (precalc_array_byte_length/16) as usize;
        let mut ranges = vec![0..0; n_ranges];
        let mut buf = [0_u8; 8];
        for i in 0..n_ranges {

            buf.copy_from_slice(&precalc_array_bytes[(2*i)*8..(2*i+1)*8]);
            let left = i64::from_le_bytes(buf);
            buf.copy_from_slice(&precalc_array_bytes[(2*i+1)*8..(2*i+2)*8]);
            let right = i64::from_le_bytes(buf);

            let mut lut_idx = 0; // Index in the new lookup table
            for j in 0..precalc_k {
                lut_idx = (lut_idx << 2) | ((i >> (2*j)) & 0x3); // Trust me
            }

            ranges[lut_idx] = if left == -1 {
                0..0
            } else {
                left as usize .. (right+1) as usize // +1 to make the end exclusive
            }
        }
        if precalc_k == 0 {
            ranges.push(0..n_nodes); // Special case: range of the empty string
        }
        drop(precalc_array_bytes); // Free some memory

        log::info!("Building rank structures");
        let mut subset_rank = SubsetMatrix::new_from_bit_vectors(vec![A_bits, C_bits, G_bits, T_bits]);
        subset_rank.build_rank();

        let prefix_lut = PrefixLookupTable{ranges, prefix_length: precalc_k};

        Ok(SbwtIndexVariant::SubsetMatrix(SbwtIndex::from_components(subset_rank, n_kmers, k, C_array, prefix_lut)))

    } else {
        Err(std::io::ErrorKind::InvalidData.into())
    }

}


// When non-compatible changes to the serialization format occur, update the version number here to the current version 
const SERIALIZATION_MAGIC_STRING: &[u8] = b"sbwtfile-v0.3.0"; 

impl<SS: SubsetSeq> SbwtIndex<SS> {

    /// Number of k-mers in the index, not counting dummy k-mers.
    pub fn n_kmers(&self) -> usize {
        self.n_kmers
    }

    /// Number of sets in the SBWT.
    pub fn n_sets(&self) -> usize {
        self.sbwt.len()
    }

    /// Length of the k-mers in the index.
    pub fn k(&self) -> usize {
        self.k
    }

    /// Maps A,C,G,T to 0,1,2,3.
    /// Panics if c is not a DNA character.
    pub fn char_idx(&self, c: u8) -> usize {
        let idx = ACGT_TO_0123[c as usize] as usize;
        if idx > DNA_ALPHABET.len() {
            panic!("Invalid character: {}", char::from(c));
        }
        idx
    }

    /// Returns the alphabet ACGT of the index in ascii.
    #[allow(unused)]
    pub fn alphabet(&self) -> &[u8] {
        &DNA_ALPHABET
    }

    pub fn interval_of_empty_string(&self) -> std::ops::Range<usize> {
        0..self.n_sets()
    }

    /// Returns the C-array of the index.
    /// The array is such that C\[i\] is 1 plus the number of sets in the SBWT that
    /// contain a character that is smaller than the i-th character in the alphabet.
    #[allow(non_snake_case)]
    #[allow(unused)]
    fn C(&self) -> &[usize] {
        self.C.as_slice()
    }

    /// Writes the index to the writer and retuns the number of bytes written.
    /// The array can later be loaded with [SbwtIndex::load].
    pub fn serialize<W: std::io::Write>(&self, out: &mut W) -> std::io::Result<usize> {
        let mut n_written = 0_usize;

        let magic_string_length = [SERIALIZATION_MAGIC_STRING.len() as u8];
        n_written += util::write_bytes(out, &magic_string_length)?;
        n_written += util::write_bytes(out, SERIALIZATION_MAGIC_STRING)?;

        n_written += self.sbwt.serialize(out)?;

        // We're not using serde because we want full control over the bytes
        // in order to guarantee compatibility across languages

        n_written += util::write_bytes(out, &self.n_kmers.to_le_bytes())?;
        n_written += util::write_bytes(out, &self.k.to_le_bytes())?;
        n_written += util::write_bytes(out, &self.C.len().to_le_bytes())?; // TODO: check at build time that usize = u64, crash otherwise
        n_written += util::write_bytes(
            out,
            &self
                .C
                .iter()
                .flat_map(|x| x.to_le_bytes())
                .collect::<Vec<u8>>(),
        )?;

        n_written += self.prefix_lookup_table.serialize(out)?; 

        Ok(n_written)
    }

    /// Loads an index that was previously serialized with [SbwtIndex::serialize].
    #[allow(non_snake_case)] // For C-array
    pub fn load<R: std::io::Read>(input: &mut R) -> std::io::Result<Self> {

        let magic_string_length = input.read_u8().unwrap();
        let mut magic_string_buf = vec![0u8; magic_string_length as usize];
        input.read_exact(&mut magic_string_buf).unwrap();
        if magic_string_buf != SERIALIZATION_MAGIC_STRING {
            panic!("Error loading SBWT: incorrect version string: expected \"{}\", found \"{}\"", String::from_utf8(SERIALIZATION_MAGIC_STRING.to_vec()).unwrap(), String::from_utf8_lossy(&magic_string_buf));
        }

        let subset_rank = SS::load(input)?;

        let n_kmers = byteorder::ReadBytesExt::read_u64::<LittleEndian>(input).unwrap() as usize;
        let k = byteorder::ReadBytesExt::read_u64::<LittleEndian>(input).unwrap() as usize;
        let C_len = byteorder::ReadBytesExt::read_u64::<LittleEndian>(input).unwrap() as usize;

        let mut C_bytes = vec![0_u8; C_len * 8];
        input.read_exact(&mut C_bytes)?;

        let C = C_bytes
            .chunks_exact(8)
            .map(|chunk| {
                let mut arr = [0_u8; 8];
                arr.copy_from_slice(chunk);
                u64::from_le_bytes(arr) as usize
            })
            .collect::<Vec<usize>>();

        let prefix_lookup_table = PrefixLookupTable::load(input)?;

        let index = Self {
            sbwt: subset_rank,
            n_kmers,
            k,
            C,
            prefix_lookup_table
        };

        Ok(index)

    }

    /// Returns the edge label on the incoming edge to the i-th node in colexicographic order 
    /// in the SBWT graph (not the same graph as the de Bruijn graph!), or None if the node has 
    /// no incoming edge. This is well-defined since every node in the SBWT graph has at most 1 incoming edge.
    pub fn inlabel(&self, i: usize) -> Option<u8> {
        if i == 0 {
            return None; // Source node
        }

        for c_idx in 0..DNA_ALPHABET.len() {
            if self.C[c_idx] > i {
                return Some(DNA_ALPHABET[c_idx-1]); // c_idx > 0 because of the source node
            }
        }
        Some(*DNA_ALPHABET.last().unwrap()) // Last character

        // For example, if
        // C = 1, 234, 654, 954
        //
        // Then:
        // [0..1) $
        // [1..234) A
        // [234..654) C
        // [654..954) G
        // [954..n) T
    }

    /// Build select support for the SBWT subset sequence. This is required for
    /// [SbwtIndex::inverse_lf_step] and [SbwtIndex::access_kmer].
    pub fn build_select(&mut self) {
        self.sbwt.build_select();
    }

    /// A low-level function returning `C[char_idx] + SBWT.rank(char_idx, i)`.
    pub fn lf_step(&self, i: usize, char_idx: usize) -> usize {
        self.C[char_idx] + self.sbwt.rank(char_idx as u8, i)
    }

    /// The inverse function of [`SbwtIndex::lf_step`].
    /// Requires that the [select support](SbwtIndex::build_select) has been built.
    /// Returns None if called on the source node of the SBWT graph.
    pub fn inverse_lf_step(&self, i: usize) -> Option<usize> {
        match self.inlabel(i){
            None => None, // Source node
            Some(c) => {
                let char_idx = self.char_idx(c);
                let rank_within_c = i - self.C[char_idx];
                Some(self.sbwt.select(char_idx as u8, rank_within_c).unwrap())
            }
        }
    }

    /// Requires that the [select support](SbwtIndex::build_select) has been built.
    /// Accesses the k-mer associated with the node with colexicographic rank
    /// `colex_rank`. Note that this k-mer may contain dollar symbols if it's
    /// a dummy node. The k-mer string is pushed to `buf`.
    pub fn push_kmer_to_vec(&self, mut colex_rank: usize, buf: &mut Vec<u8>){
        let buf_start = buf.len();
        for _ in 0..self.k {
            match self.inlabel(colex_rank) {
                Some(c) => {
                    buf.push(c);
                    colex_rank = self.inverse_lf_step(colex_rank).unwrap(); // Can unwrap because c != None
                },
                None => {
                    buf.push(b'$');
                },
            }
        }
        buf[buf_start..buf_start+self.k].reverse();
    } 

    /// Requires that the [select support](SbwtIndex::build_select) has been built.
    /// Accesses the k-mer associated with the node with colexicographic rank
    /// `colex_rank`. Note that this k-mer may contain dollar symbols if it's
    /// a dummy node. The k-mer is retuned as a `Vec<u8>`. To push to an existing
    /// buffer for better performance, see [SbwtIndex::push_kmer_to_vec].
    pub fn access_kmer(&self, colex_rank: usize) -> Vec<u8> {
        let mut buf = Vec::with_capacity(self.k);
        self.push_kmer_to_vec(colex_rank, &mut buf);
        buf
    }

    /// Returns the colexicographic interval of the pattern, if found.
    /// The pattern is given in ascii characters.
    pub fn search(&self, pattern: &[u8]) -> Option<std::ops::Range<usize>> {
        let l = self.prefix_lookup_table.prefix_length;
        match pattern.len() >= l {
            true => {
                // Initialize search from lookup table
                let I = self.prefix_lookup_table.lookup(&pattern[0..l]);
                self.search_from(I, &pattern[l..])
            },
            false => {
                // Search from scratch 
                self.search_from(0_usize..self.sbwt.len(), pattern)
            }
        }
    }

    /// Searches the pattern starting from the given colexicographic interval.
    /// Returns the colexicographic interval after searching the pattern, if found.
    /// The pattern is given in ascii characters.
    pub fn search_from(&self, interval: std::ops::Range<usize>, pattern: &[u8]) -> Option<std::ops::Range<usize>> {
        let mut left = interval.start;
        let mut right = interval.end; 
        for chr in pattern.iter() {
            let c = ACGT_TO_0123[*chr as usize];
            if c as usize > DNA_ALPHABET.len() {
                return None; // Character does not exist in index
            }
            left = self.lf_step(left, c as usize);
            right = self.lf_step(right, c as usize);
            if left >= right {
                return None;
            }
        }
        Some(left..right)
    }

    /// Reconstruct all k-mers in the [padded k-spectrum](#sbwt-graph) in the data structure.
    /// The reconstructed k-mers concatenated into a single ascii string in colexicographic order.
    /// The i-th k-mer starts at position i*k in the string.
    pub fn reconstruct_padded_spectrum(&self, n_threads: usize) -> Vec<u8> 
    where Self: Sync {
        let n_nodes = self.n_sets();
        let k = self.k;

        // Build the last column of the SBWT matrix
        log::info!("Building column {} of the SBWT matrix", k-1);
        let mut labels_in = self.build_last_column();
        let mut labels_out = labels_in.clone();

        // Iterate columns from right to left
        let mut kmers_concat = vec![0u8; n_nodes * k];
        for round in 0..k {
            if round > 0 {
                log::info!("Building column {} of the SBWT matrix", k-1-round);
                self.push_all_labels_forward(&labels_in, &mut labels_out, n_threads);
            }

            for i in 0..n_nodes {
                let pos = k - 1 - round;
                kmers_concat[i * k + pos] = labels_out[i];
            }

            (labels_in, labels_out) = (labels_out, labels_in);
        }

        kmers_concat
    }

    /// Set the prefix lookup table of the data structure.
    pub fn set_lookup_table(&mut self, prefix_lookup_table: PrefixLookupTable){
        self.prefix_lookup_table = prefix_lookup_table;
    }

    /// Get the prefix lookup table of the data structure.
    pub fn get_lookup_table(&self) -> &PrefixLookupTable {
        &self.prefix_lookup_table
    }

    /// Get the SBWT set sequence of the data structure.
    pub fn sbwt(&self) -> &SS {
        &self.sbwt
    }

    /// Internal function: construct from parts.
    #[allow(non_snake_case)]
    pub(crate) fn from_components(subset_rank: SS, n_kmers: usize, k: usize, C: Vec<usize>, prefix_lookup_table: PrefixLookupTable) -> Self {
        Self {sbwt: subset_rank, n_kmers, k, C, prefix_lookup_table}
    }

    /// Construct from existing [`crate::subsetseq::SubsetSeq`].
    #[allow(non_snake_case)]
    pub fn from_subset_seq(subset_rank: SS, n_kmers: usize, k: usize, precalc_prefix_length: usize) -> Self {
        let C = subset_rank.get_C_array();
        let n = subset_rank.len();
        let mut index = Self{sbwt: subset_rank, n_kmers, k, C, prefix_lookup_table: PrefixLookupTable::new_empty(n)};
        index.prefix_lookup_table = PrefixLookupTable::new(&index, precalc_prefix_length);

        index
    }

    /// Private helper function for label propagation.
    /// The output range must have length self.n_sets().
    fn split_output_range_by_char<'a>(&self, output_range: &'a mut[u8]) -> Vec::<&'a mut[u8]>{
        let mut remaining_output_range = &mut output_range[1..]; // Drop index 0 because it's the root of the graph
        let mut output_ranges = Vec::<&mut[u8]>::new();

        for i in 0..self.alphabet().len() {
            let range_length = if i + 1 < self.alphabet().len() {
                self.C[i+1] - self.C[i]
            } else {
                self.n_sets() - self.C[i]
            };

            let (head, tail) = remaining_output_range.split_at_mut(range_length); 
            output_ranges.push(head);
            remaining_output_range = tail;
        }
        assert!(remaining_output_range.is_empty());
        output_ranges
    }

    /// Push labels forward along the SBWT graph.
    /// The effect is this: if `labels_in` is a list of labels, one for each node in the SBWT in colex order, then the output
    /// is those labels pushed forward along the edges in the SBWT graph. Those nodes that do not have a predecessor
    /// (just the root of the graph) get a dollar.
    pub fn push_all_labels_forward(&self, labels_in: &[u8], labels_out: &mut [u8], n_threads: usize) 
    where Self: Sync {

        labels_out[0] = b'$'; // The root has no predecessor so it always gets a dollar.
        let mut remaining_char_output_ranges = self.split_output_range_by_char(labels_out);

        let input_piece_len = self.n_sets().div_ceil(n_threads);

        // Subdivide char output ranges for threads.
        // For each thread one input sbwt range and four (= |ACGT|) output ranges. 
        let mut thread_input_output_ranges = Vec::<(std::ops::Range<usize>, Vec::<&mut[u8]>)>::new();

        for t in 0..n_threads {

            let input_range_start = t*input_piece_len;
            let input_range_end = std::cmp::min((t+1)*input_piece_len, self.n_sets());
            let input_subrange = input_range_start..input_range_end;

            let mut thread_output_subranges: Vec<&mut[u8]> = Vec::new();

            let mut range_tails = Vec::<&mut[u8]>::new();
            for (char_idx, output_range) in remaining_char_output_ranges.into_iter().enumerate() {

                // The length of the output range is equal to the number of occurrences of the current character in the input range
                let output_range_len = self.sbwt.rank(char_idx as u8, input_range_end) - self.sbwt.rank(char_idx as u8, input_range_start);
                let (output_subrange, tail) = output_range.split_at_mut(output_range_len); 
                range_tails.push(tail);

                thread_output_subranges.push(output_subrange);
            }
            remaining_char_output_ranges = range_tails;
            
            thread_input_output_ranges.push((input_subrange, thread_output_subranges));
        }

        let thread_pool = rayon::ThreadPoolBuilder::new().num_threads(n_threads).build().unwrap();
        thread_pool.install(||{
            thread_input_output_ranges.into_par_iter().for_each(|(input_subrange, output_subranges)| {
                self.push_labels_forward(&labels_in[input_subrange.clone()], input_subrange.clone(), output_subranges);
            });
        });
    }

    // Internal helper function. If the i-th outedge leaves from node with colex
    // rank r, and character c, return (r, j), where j is the index of c in the alphabet.
    // If i is equal to the number of outedges, returns (self.n_sets(), DNA_ALPHABET.len()-1)
    fn get_ith_edge(&self, i: usize) -> (usize, usize) {
        assert!(i <= self.n_sets() * DNA_ALPHABET.len());
        let j = crate::util::binary_search_leftmost_that_fulfills_pred(|j| j, |probe| {
            let (char_idx, node_colex) = if probe+1 == self.n_sets() * DNA_ALPHABET.len() {
                // This needs to be a special case because otherwise the binary search
                // will try rank on char |DNA_ALPHABET| at colex position 0, but that
                // character does not exist. Instead we rank one past the end of the
                // SBWT matrix row of the last character.
                (DNA_ALPHABET.len()-1, self.n_sets())
            } else {
                ((probe+1) / self.n_sets(), (probe+1) % self.n_sets())
            };
            self.C[char_idx] - 1 + self.sbwt.rank(char_idx as u8, node_colex) > i
            // -1 is for the dollar.
        }, self.n_sets()*DNA_ALPHABET.len());

        if j == self.n_sets() * DNA_ALPHABET.len() {
            (self.n_sets(), DNA_ALPHABET.len()-1) // Due to the same special case as in the search 
        } else {
            (j % self.n_sets(), j / self.n_sets())
        }
    }

    /// A version of [SbwtIndex::push_all_labels_forward] that stores the labels as
    /// compact integer vectors with elements {0,1, ..., 4}. The symbol 0 is the "dollar" and
    /// symbols 1,2,3,4 are A,C,G and T.
    /// (see the documentation of [SbwtIndex::push_all_labels_forward].
    pub fn push_all_labels_forward_compact(&self, labels_in: &CompactIntVector<3>, labels_out: &mut CompactIntVector<3>, n_threads: usize) 
    where Self: Sync {

        let output_char_ranges = labels_out.split_to_approx_even_mut_ranges(n_threads);
        // The first output range is special because labels_out[0] has to be written a '$'.

        // Tuples (start_colex_rank, start_char_idx, end_colex_rank, end_char_idx)
        // Exclusive ends
        let mut input_edge_ranges: Vec<(usize, usize, usize, usize)> = vec![];
        let mut edges_so_far = 0_usize;
        for r in output_char_ranges.iter() {
            let input_range_len = if edges_so_far == 0 {
                assert!(r.len() > 0);
                r.len() - 1 // No incoming edge to the first node 
            } else {
                r.len()
            };
            let (r_s, c_s) = self.get_ith_edge(edges_so_far);
            let (r_e, c_e) = self.get_ith_edge(edges_so_far + input_range_len);
            edges_so_far += input_range_len;
            input_edge_ranges.push((r_s, c_s, r_e, c_e));
        } 

        assert_eq!(output_char_ranges.len(), input_edge_ranges.len());

        output_char_ranges.into_par_iter().enumerate().for_each(|(i, mut out)| {
            self.push_labels_forward_compact(input_edge_ranges[i], labels_in, &mut out, i == 0);
        })
    }

    /// Internal function. A compact version of [push_labels_forward].
    pub(crate) fn push_labels_forward_compact(&self, input_edge_range: (usize, usize, usize, usize), input_chars: &CompactIntVector<3>, output_char_range: &mut CompactIntVectorMutSlice<3>, first_input_range: bool){

        // Tuple (start_colex_rank, start_char_idx, end_colex_rank, end_char_idx)
        let (r_s, c_s, r_e, c_e) = input_edge_range;
        if (r_s, c_s) == (r_e, c_e) {
            return; // Empty input range -> nothing to do
            // If we don't return here then one of the asserts below will fail.
        }

        let has_dollar = first_input_range; 
        if has_dollar {
            assert!(output_char_range.len() > 0);
            output_char_range.set(0,0); // The "dollar"
        }


        let mut n_pushed = has_dollar as usize; // Skip over dollar if we have it
        for c in c_s..=c_e {
            let mut colex = if c == c_s {
                assert!(self.sbwt.set_contains(r_s, c as u8));
                r_s
            } else {
                match self.sbwt.next_set_with_char(0, c as u8) {
                    Some(i) => i,
                    None => break
                }
            };
            let colex_end = if c == c_e {
                r_e
            } else {
                self.n_sets()
            };

            while colex < colex_end {
                output_char_range.set(n_pushed, input_chars.get(colex));
                n_pushed += 1;
                colex = match self.sbwt.next_set_with_char(colex+1, c as u8) {
                    Some(i) => i,
                    None => break // End of row
                }
            }
        }

        assert_eq!(n_pushed, output_char_range.len());
    }


    /// Internal function. Takes an input range in the SBWT, and a vector of labels (one for each position in the input range).
    /// Output: writes the input labels to the output ranges. Read the code for details. 
    /// The length of output_ranges[j] must be equal to the number of occurrences of character j in the input range.
    pub(crate) fn push_labels_forward(&self, labels: &[u8], sbwt_input_range: std::ops::Range<usize>, output_ranges: Vec<&mut[u8]>){
        self.sbwt.push_labels_forward(labels, sbwt_input_range, output_ranges);
    }

    /// Build the last column of the SBWT matrix.
    pub fn build_last_column(&self) -> Vec<u8> {
        let mut last = Vec::<u8>::with_capacity(self.n_sets());
        last.push(b'$');
        for c_i in 0..self.alphabet().len(){
            let count = self.sbwt().rank(c_i as u8, self.n_sets());
            let c = self.alphabet()[c_i];
            for _ in 0..count {
                last.push(c);
            }
        }
        assert_eq!(last.len(), self.n_sets());
        last
    } 

    /// Build the last column of the SBWT matrix.
    pub fn build_last_column_compact(&self) -> CompactIntVector<3> {
        let mut last = CompactIntVector::<3>::new(self.n_sets());
        last.set(0, 0); // The "dollar"
        let mut cur_idx = 1_usize;
        for c_i in 0..self.alphabet().len(){
            let count = self.sbwt().rank(c_i as u8, self.n_sets());
            for _ in 0..count {
                last.set(cur_idx, c_i+1);
                cur_idx += 1;
            }
        }
        assert_eq!(cur_idx, self.n_sets());
        last
    } 

    // Internal function: marks the first k-mer of every (k-1)-suffix group.
    pub(crate) fn mark_k_minus_1_mers(&self, n_threads: usize) -> bitvec::vec::BitVec 
    where Self: Sync {
        let n_nodes = self.n_sets();
        let k = self.k();

        // Build the last column of the SBWT matrix
        log::info!("Building column {} of the SBWT matrix", k-1);
        let mut labels_in = self.build_last_column(); 
        let mut labels_out = labels_in.clone();

        let mut marks = bitvec::bitvec![0; self.n_sets()];
        marks.set(0, true);

        // Iterate columns from right to left
        for round in 0..k-1 {
            if round > 0 {
                log::info!("Building column {} of the SBWT matrix", k-1-round);

                self.push_all_labels_forward(&labels_in, &mut labels_out, n_threads);
            }

            for i in 1..n_nodes {
                if labels_out[i] != labels_out[i-1] {
                    marks.set(i, true);
                }
            }

            (labels_in, labels_out) = (labels_out, labels_in);
        }

        marks
    }

    /// Compute a bit vector marking the dummy nodes in the SBWT
    pub fn compute_dummy_node_marks(&self) -> bitvec::vec::BitVec {
        let mut dummy_marks = bitvec![0; self.n_sets()];
        let mut dfs_stack = Vec::<(usize, usize)>::new(); // Node, depth
        dfs_stack.push((0,0)); // Colex rank of $, depth of $
        let mut outlabels = Vec::<u8>::new();
        while let Some((v, depth)) = dfs_stack.pop() { 
            outlabels.clear();
            self.sbwt.append_set_to_buf(v, &mut outlabels);
            dummy_marks.set(v, true);

            if depth + 1 < self.k() {
                for &c_idx in outlabels.iter() {
                    let u = self.lf_step(v, c_idx as usize);
                    dfs_stack.push((u, depth + 1));
                }
            }
        }

        dummy_marks
    } 

}


/// A table storing the SBWT intervals of all 4^p possible p-mers.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct PrefixLookupTable {
    /// ranges\[i\] is the interval of the p-mer with colexicographic rank
    /// i in the sorted list of all possible p-mers.
    /// If the p-mer does not exist in the SBWT, the range is [0..0).
    pub ranges: Vec<std::ops::Range<usize>>,

    /// Prefix length p.
    pub prefix_length: usize, 
}

impl PrefixLookupTable {

    /// Create a new prefix lookup table containing only the interval of the
    /// empty string.
    #[allow(clippy::single_range_in_vec_init)] // Clippy false positive. It's actually intended like this
    pub fn new_empty(n_sets_in_sbwt: usize) -> PrefixLookupTable {
        Self{ranges: vec![0..n_sets_in_sbwt], prefix_length: 0}
    }

    /// Create a new prefix lookup table by searching all DNA strings of length `prefix_length`
    /// in the given sbwt.
    pub fn new<SS: SubsetSeq>(sbwt: &SbwtIndex<SS>, prefix_length: usize) -> PrefixLookupTable {
        let mut pmer = vec![0u8; prefix_length];
        let mut ranges = vec![0..0; num::pow(4_usize, prefix_length)];
        for x in 0..num::pow(4, prefix_length) as u64{
            pmer.clear();

            // Construct the p-mer string
            for i in 0..prefix_length {
                let char_idx = (x >> (2*(prefix_length - 1 - i))) & 0x3;
                let c = DNA_ALPHABET[char_idx as usize];
                pmer.push(c);
            }

            if let Some(range) = sbwt.search(&pmer) {
                ranges[x as usize] = range;
            } // Else left as 0..0
        }
        PrefixLookupTable{ranges, prefix_length}
    }

    /// Look up the colex interval of the prefix. 
    pub fn lookup(&self, prefix: &[u8]) -> std::ops::Range<usize> {
        assert!(prefix.len() == self.prefix_length);
        let mut table_idx = 0_usize;
        for (i, c) in prefix.iter().rev().enumerate() {
            let char_idx = ACGT_TO_0123[*c as usize];
            if char_idx == 255 {
                return 0..0; // Not a DNA character
            }
            table_idx |= ((char_idx as u64) << (2*i)) as usize;
        }
        self.ranges[table_idx].clone()
    }

    /// Write the lookup table to the given writer.
    /// The lookup table can be then later loaded with [PrefixLookupTable::load].
    /// Returns number of bytes written. 
    pub fn serialize<W: Write>(&self, out: &mut W) -> std::io::Result<usize> {
        let mut n_written = 0_usize;
        n_written += util::write_bytes(out, &(self.prefix_length as u64).to_le_bytes())?;
        n_written += util::write_bytes(out, &(self.ranges.len() as u64).to_le_bytes())?;
        for range in self.ranges.iter(){
            n_written += util::write_bytes(out, &(range.start as u64).to_le_bytes())?;
            n_written += util::write_bytes(out, &(range.end as u64).to_le_bytes())?;
        }
        Ok(n_written)
    }

    /// Loads a a prefix lookup table that was previosly serialized with [PrefixLookupTable::serialize].
    pub fn load<R: Read>(input: &mut R) -> std::io::Result<Self> {

        let prefix_length = byteorder::ReadBytesExt::read_u64::<LittleEndian>(input).unwrap() as usize;
        let ranges_len = byteorder::ReadBytesExt::read_u64::<LittleEndian>(input).unwrap() as usize;

        let mut ranges = vec![0..0; ranges_len];
        for range in ranges.iter_mut(){
            let start = byteorder::ReadBytesExt::read_u64::<LittleEndian>(input).unwrap() as usize; 
            let end = byteorder::ReadBytesExt::read_u64::<LittleEndian>(input).unwrap() as usize; 
            *range = start..end;
        }

        Ok(Self{ranges, prefix_length})
    }
}

#[cfg(test)]
mod tests {

    

    use crate::builder::{BitPackedKmerSortingDisk, SbwtIndexBuilder};

    use crate::builder::BitPackedKmerSortingMem;

    use super::*;

    #[allow(non_snake_case)]
    fn ACGT_to_0123(c: u8) -> u8 {
        match c {
            b'A' => 0,
            b'C' => 1,
            b'G' => 2,
            b'T' => 3,
            _ => panic!("Invalid DNA character {} ", c as char),
        }
    }

    fn encode(s: &str) -> Vec<u8> {
        s.as_bytes()
            .iter()
            .map(|c| ACGT_to_0123(*c))
            .collect::<Vec<u8>>()
    }
    
    #[test_log::test]
    #[allow(non_snake_case)]
    fn doc_example() {
        // The example used in the documentation page for SbwtIndex.
        let seqs: Vec<&[u8]> = vec![b"TGTTTG", b"TTGCTAT", b"ACGTAGTATAT", b"TGTAAA"]; 
        let (sbwt, _) = SbwtIndexBuilder::<BitPackedKmerSortingDisk>::new().k(4).run_from_slices(seqs.as_slice());
        let mut doc_sbwt = vec![vec![b'A',b'T'], vec![b'C'], vec![], vec![b'A'], vec![b'T'], vec![b'T'], vec![b'A',b'G',b'T'], vec![], vec![], vec![b'G'], vec![b'T'], vec![b'T'], vec![b'T'], vec![b'T'], vec![b'C'], vec![b'G'], vec![b'A'], vec![], vec![], vec![b'A'], vec![b'A'], vec![b'A'], vec![b'A',b'T'], vec![b'T'], vec![b'G']];

        // ACGT to 0123
        for set in doc_sbwt.iter_mut() {
            for c in set.iter_mut(){
                *c = sbwt.char_idx(*c) as u8;
            }
        }
        let computed_sbwt: Vec<Vec<u8>> = (0..sbwt.n_sets()).map(|i| sbwt.sbwt.access(i)).collect();
        assert_eq!(doc_sbwt, computed_sbwt);
    }

    #[test_log::test]
    #[allow(non_snake_case)]
    fn LCS_paper_example() {

        let seqs: Vec<&[u8]> = vec![b"AGGTAAA", b"ACAGGTAGGAAAGGAAAGT"];

        let (mut sbwt, _) = SbwtIndexBuilder::<BitPackedKmerSortingDisk>::new().k(4).run_from_slices(seqs.as_slice());

        assert_eq!(sbwt.sbwt.len(), 18);

        assert_eq!(sbwt.sbwt.access(0), encode("A")); //   $$$$
        assert_eq!(sbwt.sbwt.access(1), encode("C")); //   $$$A
        assert_eq!(sbwt.sbwt.access(2), encode("G")); //   GAAA
        assert_eq!(sbwt.sbwt.access(3), encode("")); //    TAAA
        assert_eq!(sbwt.sbwt.access(4), encode("A")); //   GGAA
        assert_eq!(sbwt.sbwt.access(5), encode("A")); //   GTAA
        assert_eq!(sbwt.sbwt.access(6), encode("G")); //   $ACA
        assert_eq!(sbwt.sbwt.access(7), encode("A")); //   AGGA
        assert_eq!(sbwt.sbwt.access(8), encode("AG")); //  GGTA
        assert_eq!(sbwt.sbwt.access(9), encode("A")); //   $$AC
        assert_eq!(sbwt.sbwt.access(10), encode("GT")); // AAAG
        assert_eq!(sbwt.sbwt.access(11), encode("G")); //  ACAG
        assert_eq!(sbwt.sbwt.access(12), encode("G")); //  GTAG
        assert_eq!(sbwt.sbwt.access(13), encode("AT")); // AAGG
        assert_eq!(sbwt.sbwt.access(14), encode("")); //   CAGG
        assert_eq!(sbwt.sbwt.access(15), encode("")); //   TAGG
        assert_eq!(sbwt.sbwt.access(16), encode("")); //   AAGT
        assert_eq!(sbwt.sbwt.access(17), encode("A")); //  AGGT

        let true_padded_spectrum: Vec<&[u8]> = vec![b"$$$$", b"$$$A", b"GAAA", b"TAAA", b"GGAA", b"GTAA", b"$ACA", b"AGGA", b"GGTA", b"$$AC", b"AAAG", b"ACAG", b"GTAG", b"AAGG", b"CAGG", b"TAGG", b"AAGT", b"AGGT"];

        let reconstructed_padded_spectrum = sbwt.reconstruct_padded_spectrum(3);
        sbwt.sbwt.build_select();
        #[allow(clippy::needless_range_loop)]
        for i in 0..18 {
            eprintln!("{} {}",i, String::from_utf8_lossy(&sbwt.access_kmer(i)));
            assert_eq!(sbwt.access_kmer(i), true_padded_spectrum[i]);
            assert_eq!(&reconstructed_padded_spectrum[i*sbwt.k..(i+1)*sbwt.k], true_padded_spectrum[i]);
        }


        assert_eq!(sbwt.search(b""), Some(0..18));
        assert_eq!(sbwt.search(b"AGG"), Some(13..16));
        assert_eq!(sbwt.search(b"AAGT"), Some(16..17));

        // Test prefix looup table
        let two_mers = [b"AA", b"AC", b"AG", b"AT", b"CA", b"CC", b"CG", b"CT", b"GA", b"GC", b"GG", b"GT", b"TA", b"TC", b"TG", b"TT"];
        let lut = PrefixLookupTable::new(&sbwt, 2);
        for two_mer in two_mers {
            let I1 = match sbwt.search(two_mer){
                Some(I) => I,
                None => 0..0
            };
            let I2 = lut.lookup(two_mer);
            assert_eq!(I1, I2);
        }
    }

    #[test]
    fn serialize_and_load() {
        let seqs: Vec<&[u8]> = vec![b"AGGTAAA", b"ACAGGTAGGAAAGGAAAGT"];

        let (sbwt, _) = crate::builder::SbwtIndexBuilder::<BitPackedKmerSortingDisk>::new().k(4).run_from_slices(seqs.as_slice());

        let mut buf = Vec::<u8>::new();
        sbwt.serialize(&mut buf).unwrap();
        let sbwt2 = SbwtIndex::<SubsetMatrix>::load(&mut buf.as_slice()).unwrap();

        assert_eq!(sbwt, sbwt2);
    }

    #[test]
    #[allow(non_snake_case)]
    fn non_ACGT(){
        let seqs: Vec<&[u8]> = vec![b"AGGTAAA", b"ACAGGTAGGANAAGGAAAGT"];           
        //..................................................^...................

        let (sbwt, _) = crate::builder::SbwtIndexBuilder::<BitPackedKmerSortingDisk>::new().k(4).run_from_slices(seqs.as_slice());

        let mut buf = Vec::<u8>::new();
        sbwt.serialize(&mut buf).unwrap();
        let sbwt2 = SbwtIndex::<SubsetMatrix>::load(&mut buf.as_slice()).unwrap();

        assert_eq!(sbwt, sbwt2);
    }

    #[test]
    fn from_subset_seq() {
        let seqs: Vec<&[u8]> = vec![b"AGGTAAA", b"ACAGGTAGGAAAGGAAAGT"];
        let (sbwt_index, _) = crate::builder::SbwtIndexBuilder::<BitPackedKmerSortingDisk>::new().k(4).run_from_slices(seqs.as_slice());
        let ss = sbwt_index.sbwt().clone();
        let sbwt_index2 = SbwtIndex::<SubsetMatrix>::from_subset_seq(ss, sbwt_index.n_sets(), sbwt_index.k(), sbwt_index.prefix_lookup_table.prefix_length);

        let kmers1 = sbwt_index.reconstruct_padded_spectrum(3);
        let kmers2 = sbwt_index2.reconstruct_padded_spectrum(3);
        assert_eq!(kmers1, kmers2);

    }

    // https://stackoverflow.com/questions/52987181/how-can-i-convert-a-hex-string-to-a-u8-slice
    pub fn decode_hex(s: &str) -> Result<Vec<u8>, std::num::ParseIntError> {
        (0..s.len())
            .step_by(3)
            .map(|i| u8::from_str_radix(&s[i..i + 2], 16))
            .collect()
    }


    #[test]
    fn from_cpp_plain_matrix() {

        // C++ SBWT of {ACACTG, GCACTAA}
        // Built with ./build/bin/sbwt build -i small.fna -k 3 -p 2 -o small.sbwt

        let data_hex = concat!(
        "0c 00 00 00 00 00 00 00 70 6c 61 69 6e 2d 6d 61 ",
        "74 72 69 78 04 00 00 00 00 00 00 00 76 30 2e 31 ",
        "0a 00 00 00 00 00 00 00 70 02 00 00 00 00 00 00 ",
        "0a 00 00 00 00 00 00 00 84 00 00 00 00 00 00 00 ",
        "0a 00 00 00 00 00 00 00 01 02 00 00 00 00 00 00 ",
        "0a 00 00 00 00 00 00 00 20 00 00 00 00 00 00 00 ",
        "80 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ",
        "00 00 00 00 00 00 00 00 80 00 00 00 00 00 00 00 ",
        "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ",
        "80 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ",
        "00 00 00 00 00 00 00 00 80 00 00 00 00 00 00 00 ",
        "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ",
        "0a 00 00 00 00 00 00 00 f7 03 00 00 00 00 00 00 ",
        "20 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ",
        "05 00 00 00 00 00 00 00 07 00 00 00 00 00 00 00 ",
        "09 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 ",
        "01 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ",
        "02 00 00 00 00 00 00 00 03 00 00 00 00 00 00 00 ",
        "ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ",
        "04 00 00 00 00 00 00 00 04 00 00 00 00 00 00 00 ",
        "05 00 00 00 00 00 00 00 05 00 00 00 00 00 00 00 ",
        "ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ",
        "06 00 00 00 00 00 00 00 06 00 00 00 00 00 00 00 ",
        "ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ",
        "ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ",
        "ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ",
        "ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ",
        "08 00 00 00 00 00 00 00 08 00 00 00 00 00 00 00 ",
        "ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ",
        "09 00 00 00 00 00 00 00 09 00 00 00 00 00 00 00 ",
        "ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ",
        "ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ",
        "02 00 00 00 00 00 00 00 0a 00 00 00 00 00 00 00 ",
        "07 00 00 00 00 00 00 00 03 00 00 00 00 00 00 00");

        let data = decode_hex(data_hex).unwrap();

        let SbwtIndexVariant::SubsetMatrix(cpp_sbwt) = load_from_cpp_plain_matrix_format(&mut std::io::Cursor::new(&data)).unwrap();

        let (rust_sbwt, _lcs) = SbwtIndexBuilder::<BitPackedKmerSortingDisk>::new().k(3).precalc_length(2).run_from_slices(&[b"ACACTG", b"GCACTAA"]);

        assert_eq!(cpp_sbwt, rust_sbwt);

        // Check also that the loading works when the plain-matrix type id is not present.

        let data_without_type_id = &data[20..]; // Skip 8 bytes of (u64 length of type id string) + 12 bytes of "plain-matrix"
        let SbwtIndexVariant::SubsetMatrix(cpp_sbwt2) = load_from_cpp_plain_matrix_format(&mut std::io::Cursor::new(data_without_type_id)).unwrap();

        assert_eq!(cpp_sbwt2, rust_sbwt);

    }

    #[test]
    fn test_push_all_labels_forward() {

        // We need a big enough input so that the work splitting in the compact version
        // of push_labels_forward has more than one independent piece.
        let input_seq = crate::util::gen_random_dna_string(2000, 1234);

        let (sbwt, _) = SbwtIndexBuilder::<BitPackedKmerSortingDisk>::new().k(5).run_from_slices(vec![input_seq.as_slice()].as_slice());

        let mut in_labels = crate::util::gen_random_dna_string(sbwt.n_sets(), 1234);
        in_labels[100] = b'$'; // Put in a dollar in the middle for fun

        let mut out_labels = vec![0_u8; sbwt.n_sets()];
        sbwt.push_all_labels_forward(&in_labels, out_labels.as_mut_slice(), 3);

        // Correct labels for the random strings with seeds 1234 (input_seq) and 1234 (in_labels)
        let correct_out_labels = b"$CGGACTATTGTGAATTCCGCGGTCT$CCGAGGTGGTGCATACGCTGTGCACTTAGAGTATTACTGGTCCGCCATACGCCAGGTGCCAATTGGTACGCGGTACTAATGCGGCCGGACGTGCTATCTAATACTACGATAACAAGGCAAATGGTTTAAACATTGTGGTATAATTATCAATTGGTTCCGAGGTTGGGTTTCGCCAACTGTCCTCGAGCTAACCGAGTGTTTCGCTATGCAATTCCCGTTC$CCGAGGGTGGTGCTATACGCTGTGCCACTTAAATGTATTCTTGGTCCGGCCATAGGAGGTGCAATTGTACGGGTGACTATGCGCGCGGACGTTGCTATGCTATATCTCTGACTTAACAGGCAAATGTAAACATTGTGGTATAATATATCAGTTGGTTCCGGAGGTTGGGTGTTTCGCCAACTGTCTCGAGCGTAAACCGATGTTTCGGACTATTGTGCAATCGTGTTCT$CCGAGGGTGGTGCAATACGCTGTGCCACAGAATGTATTCTTGTCCGGCCATACGCCGAGGTCAATTGGTAGCGGTGACAATGCGCGCCGGACGTGCTAGCTTATCTACTGACTTACAAGAGCAAATGGTTTACTTGGTAAATAATCAATTGGTCCGGAGGTGGGTTCGCCAACTTCCTGACAAACCGATGTTCGGCTTTGGCAATTCCGCGGTTCT$CGAGGGTGTGCATATCGCTGTCCCTTAGATGATACTTGGCCGGCTACGCCGAGGTGCCATGGTACCGGTGTAATGCGCGCGGAGTTCTATCTAATACTACTGACTAACAAGAGCAAATGGTTAAACATTGTGGTATAATATATCAAGTTGGTCGGAGGTTGGGTGTTCCCAACTGTCCTCGGCGTAAACCGAGTGTT".to_vec();

        assert_eq!(out_labels, correct_out_labels);

        let mut in_labels_compact = CompactIntVector::<3>::new(in_labels.len());
        for (i, c) in in_labels.iter().enumerate() {
            let c2 = match c {
                b'$' => 0,
                b'A' => 1,
                b'C' => 2,
                b'G' => 3,
                b'T' => 4,
                _ => panic!("Should be impossible")
            };
            in_labels_compact.set(i,c2);
        }

        let mut out_labels_compact = CompactIntVector::<3>::new(in_labels_compact.len());
        sbwt.push_all_labels_forward_compact(&in_labels_compact, &mut out_labels_compact, 3);

        for i in 0..out_labels_compact.len() {
            let c = match out_labels_compact.get(i) {
                0 => b'$',
                1 => b'A',
                2 => b'C',
                3 => b'G',
                4 => b'T',
                _ => panic!("Should be impossible")
            };
            assert_eq!(c, correct_out_labels[i]);
        }
    }


    ///////////////////////////////////////////////////
    //
    // Tests for bitpacked k-mer sorting in memory
    //
    ///////////////////////////////////////////////////

    #[test_log::test]
    #[allow(non_snake_case)]
    fn doc_example_mem() {
        // The example used in the documentation page for SbwtIndex.
        let seqs: Vec<&[u8]> = vec![b"TGTTTG", b"TTGCTAT", b"ACGTAGTATAT", b"TGTAAA"];
        let (sbwt, _) = SbwtIndexBuilder::<BitPackedKmerSortingMem>::new().k(4).run_from_slices(seqs.as_slice());
        let mut doc_sbwt = vec![vec![b'A',b'T'], vec![b'C'], vec![], vec![b'A'], vec![b'T'], vec![b'T'], vec![b'A',b'G',b'T'], vec![], vec![], vec![b'G'], vec![b'T'], vec![b'T'], vec![b'T'], vec![b'T'], vec![b'C'], vec![b'G'], vec![b'A'], vec![], vec![], vec![b'A'], vec![b'A'], vec![b'A'], vec![b'A',b'T'], vec![b'T'], vec![b'G']];

        // ACGT to 0123
        for set in doc_sbwt.iter_mut() {
            for c in set.iter_mut(){
                *c = sbwt.char_idx(*c) as u8;
            }
        }
        let computed_sbwt: Vec<Vec<u8>> = (0..sbwt.n_sets()).map(|i| sbwt.sbwt.access(i)).collect();
        assert_eq!(doc_sbwt, computed_sbwt);
    }

    #[test_log::test]
    #[allow(non_snake_case)]
    fn LCS_paper_example_mem() {

        let seqs: Vec<&[u8]> = vec![b"AGGTAAA", b"ACAGGTAGGAAAGGAAAGT"];

        let (mut sbwt, _) = SbwtIndexBuilder::<BitPackedKmerSortingMem>::new().k(4).run_from_slices(seqs.as_slice());

        assert_eq!(sbwt.sbwt.len(), 18);

        assert_eq!(sbwt.sbwt.access(0), encode("A")); //   $$$$
        assert_eq!(sbwt.sbwt.access(1), encode("C")); //   $$$A
        assert_eq!(sbwt.sbwt.access(2), encode("G")); //   GAAA
        assert_eq!(sbwt.sbwt.access(3), encode("")); //    TAAA
        assert_eq!(sbwt.sbwt.access(4), encode("A")); //   GGAA
        assert_eq!(sbwt.sbwt.access(5), encode("A")); //   GTAA
        assert_eq!(sbwt.sbwt.access(6), encode("G")); //   $ACA
        assert_eq!(sbwt.sbwt.access(7), encode("A")); //   AGGA
        assert_eq!(sbwt.sbwt.access(8), encode("AG")); //  GGTA
        assert_eq!(sbwt.sbwt.access(9), encode("A")); //   $$AC
        assert_eq!(sbwt.sbwt.access(10), encode("GT")); // AAAG
        assert_eq!(sbwt.sbwt.access(11), encode("G")); //  ACAG
        assert_eq!(sbwt.sbwt.access(12), encode("G")); //  GTAG
        assert_eq!(sbwt.sbwt.access(13), encode("AT")); // AAGG
        assert_eq!(sbwt.sbwt.access(14), encode("")); //   CAGG
        assert_eq!(sbwt.sbwt.access(15), encode("")); //   TAGG
        assert_eq!(sbwt.sbwt.access(16), encode("")); //   AAGT
        assert_eq!(sbwt.sbwt.access(17), encode("A")); //  AGGT

        let true_padded_spectrum: Vec<&[u8]> = vec![b"$$$$", b"$$$A", b"GAAA", b"TAAA", b"GGAA", b"GTAA", b"$ACA", b"AGGA", b"GGTA", b"$$AC", b"AAAG", b"ACAG", b"GTAG", b"AAGG", b"CAGG", b"TAGG", b"AAGT", b"AGGT"];

        let reconstructed_padded_spectrum = sbwt.reconstruct_padded_spectrum(3);
        sbwt.sbwt.build_select();
        #[allow(clippy::needless_range_loop)]
        for i in 0..18 {
            eprintln!("{} {}",i, String::from_utf8_lossy(&sbwt.access_kmer(i)));
            assert_eq!(sbwt.access_kmer(i), true_padded_spectrum[i]);
            assert_eq!(&reconstructed_padded_spectrum[i*sbwt.k..(i+1)*sbwt.k], true_padded_spectrum[i]);
        }


        assert_eq!(sbwt.search(b""), Some(0..18));
        assert_eq!(sbwt.search(b"AGG"), Some(13..16));
        assert_eq!(sbwt.search(b"AAGT"), Some(16..17));

        // Test prefix looup table
        let two_mers = [b"AA", b"AC", b"AG", b"AT", b"CA", b"CC", b"CG", b"CT", b"GA", b"GC", b"GG", b"GT", b"TA", b"TC", b"TG", b"TT"];
        let lut = PrefixLookupTable::new(&sbwt, 2);
        for two_mer in two_mers {
            let I1 = match sbwt.search(two_mer){
                Some(I) => I,
                None => 0..0
            };
            let I2 = lut.lookup(two_mer);
            assert_eq!(I1, I2);
        }
    }

    #[test]
    fn serialize_and_load_mem() {
        let seqs: Vec<&[u8]> = vec![b"AGGTAAA", b"ACAGGTAGGAAAGGAAAGT"];

        let (sbwt, _) = crate::builder::SbwtIndexBuilder::<BitPackedKmerSortingMem>::new().k(4).run_from_slices(seqs.as_slice());

        let mut buf = Vec::<u8>::new();
        sbwt.serialize(&mut buf).unwrap();
        let sbwt2 = SbwtIndex::<SubsetMatrix>::load(&mut buf.as_slice()).unwrap();

        assert_eq!(sbwt, sbwt2);
    }

    #[test]
    #[allow(non_snake_case)]
    fn non_ACGT_mem(){
        let seqs: Vec<&[u8]> = vec![b"AGGTAAA", b"ACAGGTAGGANAAGGAAAGT"];
        //..................................................^...................

        let (sbwt, _) = crate::builder::SbwtIndexBuilder::<BitPackedKmerSortingMem>::new().k(4).run_from_slices(seqs.as_slice());

        let mut buf = Vec::<u8>::new();
        sbwt.serialize(&mut buf).unwrap();
        let sbwt2 = SbwtIndex::<SubsetMatrix>::load(&mut buf.as_slice()).unwrap();

        assert_eq!(sbwt, sbwt2);
    }

    #[test]
    fn from_subset_seq_mem() {
        let seqs: Vec<&[u8]> = vec![b"AGGTAAA", b"ACAGGTAGGAAAGGAAAGT"];
        let (sbwt_index, _) = crate::builder::SbwtIndexBuilder::<BitPackedKmerSortingMem>::new().k(4).run_from_slices(seqs.as_slice());
        let ss = sbwt_index.sbwt().clone();
        let sbwt_index2 = SbwtIndex::<SubsetMatrix>::from_subset_seq(ss, sbwt_index.n_sets(), sbwt_index.k(), sbwt_index.prefix_lookup_table.prefix_length);

        let kmers1 = sbwt_index.reconstruct_padded_spectrum(3);
        let kmers2 = sbwt_index2.reconstruct_padded_spectrum(3);
        assert_eq!(kmers1, kmers2);

    }

}